# Project 1

### Initialise
---

#### import needed module

In [1]:
import numpy as np
from collections import Counter
from collections import defaultdict
from ipywidgets import *
from tqdm import tqdm_notebook, tqdm
from aer import *
import pickle
from copy import deepcopy
import os
from enum import Enum
from scipy.special import digamma
from scipy.special import gammaln
from random import random

import mmap

#### create supporting functions
---

In [2]:
def create_pairs_and_update_files(file_f,file_e,null='<NULL>'):
    """
    given a french and an english file, it will loop over the pairs 
    of lines. For every occuring combination in a line, a pair is 
    added to the dict. 
    """
    print("creating pairs")
    counter_e = Counter()
    counter_f = Counter()
    for line_num, (line_f, line_e) in tqdm(enumerate(zip(file_f,file_e)),total=len(file_f), desc='Count Words', leave=True):
        words_f = line_f.split()
        words_e = line_e.split()
        file_f[line_num] = words_f
        file_e[line_num] = words_e
        
        for word_f in words_f:
            counter_f[word_f] += 1
            for word_e in words_e:
                counter_e[word_e] += 1
                    
    fe_pairs = dict()
    vocab_e = set()
    vocab_f = set()
    fe_pairs[('<LOW>','<LOW>')] = 1
    for line_num, (line_f, line_e) in tqdm(enumerate(zip(file_f,file_e)),total=len(file_f), desc='Pairs', leave=True):
        c = 0
        for i,f in enumerate(line_f):
            if counter_f[f] == 1:
                file_f[line_num][i] = '<LOW>'
                f = '<LOW>'
            fe_pairs[(f, null)] = 1
            if f not in vocab_f:
                vocab_f.update([f])
            for j,e in enumerate(line_e):
                if not c and e != '<LOW>' and e != null and counter_e[e] == 1:
                    file_e[line_num][j] = '<LOW>'
                    e = '<LOW>'
                fe_pairs[(f,e)] = 1
                if not c and e not in vocab_e:
                    vocab_e.update([e])
            c += 1

    return fe_pairs, vocab_f, vocab_e

def update_files(vocab_f,vocab_e,file_f,file_e,null='<NULL>'):
    """
    given a french and an english file and the vocabularies, 
    it will update the files with non occuring 
    """
    for line_num, (line_f, line_e) in enumerate(zip(file_f,file_e)):
        words_f = line_f.split()
        words_e = line_e.split()
        for i,f in enumerate(words_f):
            if f not in vocab_f:
                words_f[i] = '<LOW>'
        for i,e in enumerate(words_e):
            if e not in vocab_e:
                words_e[i] = '<LOW>'
        file_f[line_num] = words_f
        file_e[line_num] = words_e

Different enum classes, that can are used to set certain hyperparameters for training the model. 

In [3]:
class IBM_model(Enum):
    I = 1
    II = 2
    
class Initialization_type(Enum):
    uniform = 1
    random = 2
    modelI = 3
    
class Termination_type(Enum):
    epochs = 1
    perplexity_convergence = 2

### Metrics calculation
---

In [4]:
def calculate_perplexities(model,t,jump_dist,max_jump,file_f_train,file_e_train,file_f_val,file_e_val,calc_LL_train,use_VB=False,vb_alpha=0.1,null='<NULL>'):
    """
    Given the model, it determines which perplexity calculation should be done. 
    It calculates the perplexity for both the training and the validation data. 
    """
    train_log_likelihood = -1 
    train_perplexity = -1
    if calc_LL_train:
        train_log_likelihood, train_perplexity = calculate_perplexity(model,
                                                                     t,
                                                                     jump_dist,
                                                                     max_jump,
                                                                     file_f_train,
                                                                     file_e_train,
                                                                     use_VB,
                                                                     vb_alpha,
                                                                     null=null)        
    val_log_likelihood, val_perplexity = calculate_perplexity(model,
                                                             t,
                                                             jump_dist,
                                                             max_jump,
                                                             file_f_val,
                                                             file_e_val,
                                                             use_VB,
                                                             null=null)
    
    return train_log_likelihood, val_log_likelihood, train_perplexity, val_perplexity

In [5]:
def calculate_perplexity(model,t,jump_dist, max_jump,file_f,file_e,use_VB=False,vb_alpha=0.1,null='<NULL>'):
    if model == IBM_model.I:
        log_likelihood,N = calculate_log_likelihood_modelI(t, file_f, file_e, use_VB, vb_alpha, null)
    else:
        log_likelihood,N = calculate_log_likelihood_modelII(t, file_f, file_e, jump_dist, max_jump, null)
    
    perplexity = np.exp(-1*log_likelihood/N)
    return log_likelihood,perplexity

In [6]:
class Metrics_tracker:
    def __init__(self,
                 save_prefix, 
                 align_path, 
                 validation_truth, 
                 validation_file_f, 
                 validation_file_e,
                 train_file_f,
                 train_file_e,
                 test_truth,
                 test_file_f,
                 test_file_e,
                 vocab_f,
                 vocab_e,
                 calc_LL_train,
                 file_enc):
              
        self.save_prefix = save_prefix
        self.align_path = align_path
        self.validation_truth = validation_truth
        self.validation_file_f = validation_file_f
        self.validation_file_e = validation_file_e
        self.train_file_f = train_file_f
        self.train_file_e = train_file_e
        self.test_truth = test_truth
        self.test_file_f = test_file_f
        self.test_file_e = test_file_e
        self.vocab_f = vocab_f
        self.vocab_e = vocab_e
        self.calc_LL_train = calc_LL_train
        self.file_enc = file_enc
        
        # Track
        self.val_aers = []
        self.train_log_likelihoods = []
        self.val_log_likelihoods = []
        self.train_perplexities = []
        self.val_perplexities = []
        self.test_aer = None
    
    def track_metrics(self, epoch, model, t, jump_dist=None, max_jump=None, use_VB=False,vb_alpha=0.1):
        aer = self.calculate_aer_validation(epoch, model, t, jump_dist, max_jump)
        train_ll, val_ll, train_pp, val_pp = self.calculate_perplexities(model, t, jump_dist, max_jump, use_VB,vb_alpha)
        
        # Store
        self.val_aers.append(aer)
        self.train_log_likelihoods.append(train_ll)
        self.val_log_likelihoods.append(val_ll)
        self.train_perplexities.append(train_pp)
        self.val_perplexities.append(val_pp)
        
    def print_last_metrics(self, epoch = None, aer=True,train_ll=True,val_ll=True,train_pp=True,val_pp=True):
        if epoch == None:
            epoch = len(val_aers)
        print('Results Epoch: '+str(epoch))
        print('====================')
        if aer and self.val_aers:
            print('AER:\n\t validation:\t{0}'.format(self.val_aers[-1]))
        if train_ll or val_ll:
            print('Log Likelihood:')
            if train_ll and self.train_log_likelihoods:
                print('\t train:\t\t{0}'.format(self.train_log_likelihoods[-1]))
            if val_ll and self.val_log_likelihoods:
                print('\t validation:\t{0}'.format(self.val_log_likelihoods[-1]))
        if train_pp or val_pp:
            print('Perplexity:')
            if train_pp and self.train_perplexities:
                print('\t train:\t\t{0}'.format(self.train_perplexities[-1]))
            if val_ll and self.val_perplexities:
                print('\t validation:\t{0}'.format(self.val_perplexities[-1]))
    
    def save_metrics(self, file_name = 'metrics.p'):
        metrics = {'train_log_likelihoods': self.train_log_likelihoods,
                   'val_log_likelihoods': self.val_log_likelihoods,
                   'train_perplexities': self.train_perplexities,
                   'val_perplexities': self.val_perplexities,
                   'val_aers': self.val_aers,
                   'test_aer': self.test_aer}
        pickle.dump(metrics, open(file_name, "wb"))
    
    def calculate_aer_validation(self, epoch, model, t, jump_dist=None, max_jump=None):
        return self.calculate_aer(epoch, model, t, self.validation_file_f, self.validation_file_e, self.validation_truth, jump_dist, max_jump)
        
    def calculate_aer_test(self, epoch, model, t, jump_dist=None, max_jump=None):
        self.test_aer = self.calculate_aer(epoch, model, t, self.test_file_f, self.test_file_e, self.test_truth, jump_dist, max_jump)
        return self.test_aer
    
    def calculate_aer(self, epoch, model, t, file_f, file_e, file_truth, jump_dist, max_jump):
        align_file = os.path.join(self.align_path,'{0}validation_epoch{1}.align'.format(self.save_prefix, epoch))
        if model == IBM_model.I:
            create_alignments_modelI(t, file_f, file_e, align_file, self.file_enc)
        elif model == IBM_model.II:
            create_alignments_modelII(t, jump_dist, max_jump, file_f, file_e, align_file, self.file_enc)

        aer = test(file_truth, align_file)
        return aer
    
    def calculate_perplexities(self, model, t, jump_dist, max_jump, use_VB=False,vb_alpha=0.1):
        train_ll, val_ll, train_pp, val_pp = calculate_perplexities(model,
                                                                    t,
                                                                    jump_dist,
                                                                    max_jump,
                                                                    self.train_file_f,
                                                                    self.train_file_e,
                                                                    self.validation_file_f,
                                                                    self.validation_file_e,
                                                                    self.calc_LL_train,
                                                                    use_VB,
                                                                    vb_alpha)

        return train_ll, val_ll, train_pp, val_pp

# IBM I
---

In [7]:
def init_params_modelI(initial_method, pairs, null='<NULL>'):
    # Returns: t[(f,e)] - translation probabilities
    
    assert initial_method == Initialization_type.uniform, 'Unsupported initalization method {} for IBM model I'.format(initial_method)
    
    e_vocab_size = sum(1 for k,v in tqdm(pairs,  desc='Init Norm', leave=True) if v != null)
    t = dict(zip(pairs,[1.0/e_vocab_size]*len(pairs)))    
    return t

In [8]:
# Train
'''
E-step:
    for each word j in french sentence:
        the probability of fj|ei divided by (for t=0>m: fj|et)
        
M-step:
    E[fe]/E[e]
'''
def em_step_modelI(t, file_f, file_e, use_VB, alpha):
    num_lines = len(file_f)
    
    # Set to zero
    cooccurrences = defaultdict(float) # count words e and f happen together
    total_f = defaultdict(float) # count word f happens
    counter_f = Counter()
    total_e = defaultdict(float) # count word e happens
    
    for f_sentence, e_sentence in tqdm(zip(file_f,file_e),total=num_lines,  desc='E-step', leave=True):
        for e in e_sentence:
            total_e[e] = 0
            for f in f_sentence:
                total_e[e] += t[(f,e)]
                counter_f[f] += 1

        for e in e_sentence:
            for f in f_sentence:
                temp = t[(f,e)] / total_e[e]
                cooccurrences[(f,e)] += temp
                total_f[f] += temp

    for f,e in tqdm(cooccurrences.keys(),  desc='M-Step', leave=True):
        if use_VB:
            #theta_f|e =  exp( digamma(lambda_f|e) - digamma(sum_f' lambda(f'|e))) where lambda_f|e = E(#f-e)+alpha_f   
            t[(f,e)] = np.exp( digamma(cooccurrences[(f,e)] + alpha) - digamma(total_f[f] + counter_f[f]*alpha))
        else:
            t[(f,e)] = cooccurrences[(f,e)] / total_f[f]
        
    return t

In [9]:
def create_alignments_modelI(t, file_f, file_e, target, file_enc='utf-8'):
    # open file to write to
    with open(target,'w',encoding=file_enc) as tar:
        # for each sentence in list
        for line_num, (f_sentence,e_sentence) in tqdm(enumerate(zip(file_f,file_e)),total=len(file_f),  desc='AlignI', leave=True):
            # for each word in sentence, find the best alignment
            for ind_f,f in enumerate(f_sentence):
                ind_f += 1 #0 is reserved for null
                max_ind_e = 0 #when no alignment is found, align to zero
                max_p = 0
                for ind_e,e in enumerate(e_sentence):
                    ind_e += 1 #0 is reserved for null
                    if (f,e) in t:
                        if t[(f,e)] > max_p:
                            max_p = t[(f,e)]
                            max_ind_e = ind_e

                if max_ind_e != 0: # Skip null alignments
                    # write to file. Output: sentence_line english_pos french_pos probability
                    tar.write('%d %d %d P %f\n'%(line_num, max_ind_e, ind_f, max_p)) 

In [10]:
def calculate_log_likelihood_modelI(t, file_f, file_e, use_VB=False, vb_alpha=0.1, null='<NULL>'):
    log_likelihood = 0
    N = 0
    # Set to zero
    cooccurrences = defaultdict(float) # count words e and f happen together
    total_f = defaultdict(float) # count word f happens
    counter_f = Counter()
    total_e = defaultdict(float) # count word e happens

    for sentence_f, sentence_e in tqdm(zip(file_f,file_e),total=len(file_f),  desc='Calc LL', leave=True):
        l = len(sentence_e)
        m = len(sentence_f)
        
        if use_VB:
            for e in sentence_e:
                total_e[e] = 0
                for f in sentence_f:
                    if (f,e) in t:
                        total_e[e] += t[(f,e)]
                    counter_f[f] += 1

            for e in sentence_e:
                for f in sentence_f:
                    if (f,e) in t:
                        temp = t[(f,e)] / total_e[e]
                        cooccurrences[(f,e)] += temp
                        total_f[f] += temp

        #part1
        alignment_prob = -np.log(m*np.log(l+1)) #+ np.log(-m)
        sentence_e = [null] + sentence_e
        for f in sentence_f:
            max_p = 0
            sum_p = 0
            for e in sentence_e:
                if (f,e) in t:
                    sum_p += t[(f,e)]
                    if t[(f,e)] > max_p:
                        max_p = t[(f,e)]        
            N += 1
            log_likelihood += alignment_prob + np.log(max_p)

    #part2
    if use_VB:
        for f,e in tqdm(cooccurrences.keys(),  desc='part2_elbo', leave=True):
            lambda_fe = cooccurrences[(f,e)] + alpha
            exp_log_theta_fe = digamma(lambda_fe) - digamma(total_f[f] + counter_f[f]*vb_alpha)
            log_likelihood += exp_log_theta_fe + gammaln(lambda_fe) - gammaln(vb_alpha)
    
    return log_likelihood, N


# IBM II
---

In [11]:
def init_params_modelII(initial_method, pairs, max_jump, t=None, null='<NULL>'):
    # Returns: t[(f,e)] and jump_dist
    
    if t == None:
        if initial_method == Initialization_type.uniform:
            t = init_params_modelI(initial_method, pairs)
        elif initial_method == Initialization_type.random:
            t = dict(zip(pairs,[random() for x in range(len(pairs))]))
        elif initial_method == Initialization_type.modelI:
            # Initialize t from model I output 10 iterations
            t = em_algorithm(model=IBM_model.I,max_epoch=10,initial_method=Initialization_type.uniform,save_pickles=False)

    # Initialize jump distribution
    jump_dist = 1. / (2 * max_jump) * np.ones([1, 2 * max_jump])
    
    return t, jump_dist

In [12]:
# Train

def em_step_modelII(t, jump_dist, max_jump, file_f, file_e, null='<NULL>'):
    # Set to zero
    counts_e_f = defaultdict(float) # counts words e and f happen together
    counts_e = defaultdict(float) # counts word e happens
    counts_jump = [0] * max_jump*2 # counts per jump between words
    
    num_lines = len(file_f)
    
    for f_sentence, e_sentence in tqdm(zip(file_f,file_e),total=num_lines,  desc='E-step', leave=True):
        # Get lengths
        l = len(e_sentence)
        m = len(f_sentence)
        f_sentence = [None] + f_sentence
        e_sentence = [null] + e_sentence
      
        for i in range(1, m+1): # french
            f = f_sentence[i]
            den = sum(jump_dist[0, jump_func(x,i,l,m,max_jump)]*t[(f,e_sentence[x])] for x in range(0, l+1))
            assert den != 0, 'normalization denominator is zero. i: {}, l:{}, m:{}'.format(i,l,m)
            
            for j in range(0, l+1): # english
                e = e_sentence[j]
                
                jump_idx = jump_func(j, i, l, m, max_jump)
                delta = t[(f,e)] * jump_dist[0, jump_idx] / den

                counts_e_f[(e,f)] += delta
                counts_e[e] += delta
                counts_jump[jump_idx] += delta

    for e,f in tqdm(counts_e_f.keys(),  desc='M-step', leave=True):
        assert counts_e[e] != 0, 'counts_e[{}] is zero'.format(e)
        t[(f,e)] = counts_e_f[(e,f)] / counts_e[e]

    jump_den = sum(counts_jump)
    assert jump_den != 0, 'normalization denominator for jumps is zero'
    for i,c in enumerate(counts_jump):
        jump_dist[0,i] = c / jump_den        

    return t, jump_dist

In [13]:
# Jump function. From https://uva-slpl.github.io/nlp2/projects/2018/04/12/project1.html

def jump_func(i, j, m, n, max_jump):
    """
    Alignment of french word j to english word i. 
    i = 0, to ,m  (we use m as in Wilker's lecture slides -- length of English sentence)
    j = 1, to ,n  (we use n as in Wilker's lecture slides -- length of French sentence)
    That is: a_j = i
    with e.g. max_jump = 100
    from[-max_jump, max_jump] to [0, 2*max_jump + 1] 
    """
    # We normalise j by the lenght of the French sentence and scale the result to the length of the English sentence
    # this gives us a continuous value that is an interpolation of where we j would be in the English sentence
    # if alignments were a linear function of the length ratio
    jump = np.floor(i - (j * m / n)) 
    # then we collapse all jumps that are too far to the right to the maximum jump value allowed
    if jump > max_jump:  # or we collapse all jumps that are too far to the left to the maximum (negative) jump allowed
        jump = max_jump   
    elif jump < -max_jump:
        jump = -max_jump
    # Now we shift the jump values so they start from 0
    #  this is only necessary if you use python lists or numpy vectors to store jump probabilities
    #  otherwise, you can use a python dict and this shifting is not required since dicts can have negative keys
    idx = (int)(jump + max_jump)
    if idx >= 2*max_jump: # Fix for out of bounds index
        idx -= 1
    return idx

In [14]:
def create_alignments_modelII(t, jump_dist, max_jump, file_f, file_e, target, file_enc='utf-8', null='<NULL>'):
    # open file to write to
    with open(target,'w',encoding=file_enc) as tar:
        # for each sentence in list
        for line_num, (f_sentence,e_sentence) in tqdm(enumerate(zip(file_f,file_e)), total=len(file_f), desc='AlignII', leave=True):
            # Get lengths
            l = len(e_sentence)
            m = len(f_sentence)
            f_sentence = [None] + f_sentence
            e_sentence = [null] + e_sentence

            # for each word position in sentence, find the best alignment
            for i in range(1, m+1): # french
                max_p = 0
                max_ind = 0 #when no alignment is found, align to zero
                f = f_sentence[i]
                for j in range(0, l+1): # english
                    e = e_sentence[j]

                    if (f,e) in t:
                        p = t[(f,e)]*jump_dist[0, jump_func(j,i,l,m,max_jump)]

                        if p >= max_p:
                            max_p = p
                            max_ind = j

                if max_ind != 0: # Skip null alignments
                    # write to file. Output: sentence_line english_pos french_pos probability
                    tar.write('%d %d %d P %f\n'%(line_num, max_ind, i, max_p)) 

In [15]:
def calculate_log_likelihood_modelII(t, file_f, file_e, jump_dist, max_jump, null='<NULL>'):
    log_likelihood = 0
    N = 0
    for sentence_f, sentence_e in tqdm(zip(file_f,file_e),total=len(file_f),  desc='Calc LL', leave=True):
        l = len(sentence_e)
        m = len(sentence_f)

        sentence_e = [null] + sentence_e
        sentence_f = [None] + sentence_f
        
        for i in range(1, m+1): # french
            f = sentence_f[i]
            max_p = 0
            max_align_p = 0
            for j in range(0, l+1): # english
                e = sentence_e[j]
                if (f,e) in t and t[(f,e)] >= max_p:
                    max_p = t[(f,e)]
                    max_align_p = jump_dist[0, jump_func(j,i,l,m,max_jump)]                
            N += 1
            log_likelihood += np.log(max_align_p) + np.log(max_p)
    return log_likelihood, N

# Shared

In [16]:
def em_algorithm(model,
                 t=None, #Only used for model II
                 max_epoch=10, 
                 threshold=0.01,
                 initial_method=Initialization_type.uniform, #How to initialize t
                 terminate_method=Termination_type.epochs, 
                 train_file_f='data/training/hansards.36.2.f',
                 train_file_e='data/training/hansards.36.2.e',
                 validation_file_f='data/validation/dev.f',
                 validation_file_e='data/validation/dev.e',
                 validation_truth='data/validation/dev.wa.nonullalign',
                 test_file_f = 'data/testing/test/test.f',
                 test_file_e = 'data/testing/test/test.e',
                 test_truth = 'data/testing/answers/test.wa.nonullalign',
                 pickles_path='data/pickles/',
                 align_path='data/alignments/',
                 save_prefix='',
                 save_pickles=True,
                 use_VB=False,
                 calc_LL_train=True,
                 alpha=0.1, #Only used if VB is used
                 file_enc='utf-8'):
    
    # test if prefix exists and correct format
    if save_prefix != '' and save_prefix[-1]!='_':
        save_prefix+='_'
    
    # read in all the files
    with open(train_file_f, encoding=file_enc) as f:
        train_file_f = f.readlines()
    with open(train_file_e, encoding=file_enc) as f:
        train_file_e = f.readlines()
    with open(validation_file_f, encoding=file_enc) as f:
        validation_file_f = f.readlines()
    with open(validation_file_e, encoding=file_enc) as f:
        validation_file_e = f.readlines()
    with open(test_file_f, encoding=file_enc) as f:
        test_file_f = f.readlines()
    with open(test_file_e, encoding=file_enc) as f:
        test_file_e = f.readlines()
    
    # get word pairs from corpus
    pairs, vocab_f, vocab_e = create_pairs_and_update_files(train_file_f, train_file_e)
    update_files(vocab_f, vocab_e, validation_file_f, validation_file_e)
    update_files(vocab_f, vocab_e, test_file_f, test_file_e)

    #initialize parameters
    if model == IBM_model.I:
        t = init_params_modelI(initial_method, pairs)
    elif model == IBM_model.II:
        # For jump function
        max_jump = 100        
        t, jump_dist = init_params_modelII(initial_method, pairs, max_jump, t)
    
    tracker = Metrics_tracker(save_prefix, 
                              align_path, 
                              validation_truth, 
                              validation_file_f, 
                              validation_file_e,
                              train_file_f,
                              train_file_e,
                              test_truth,
                              test_file_f,
                              test_file_e,
                              vocab_f,
                              vocab_e,
                              calc_LL_train,
                              file_enc)
    
    # calculate initial scores before training
    tracker.track_metrics(0, model, t, jump_dist if model == IBM_model.II else None, 
                                       max_jump if model == IBM_model.II else None,
                                       use_VB,alpha)
    
    
    #print train result
    tracker.print_last_metrics('Init')
        
    # loop for max_epochs or till convergence is reached
    for epoch in range(1,max_epoch+1):
        print("start epoch: "+str(epoch))
        
        # do an EM step
        if model == IBM_model.I:
            t = em_step_modelI(t, train_file_f, train_file_e, use_VB, alpha)
        else:
            t, jump_dist = em_step_modelII(t, jump_dist, max_jump, train_file_f, train_file_e)
        
        # create AER results and calculate the loglikelihoods/perplexity
        tracker.track_metrics(epoch, model, t, jump_dist if model == IBM_model.II else None, 
                                               max_jump if model == IBM_model.II else None)
        tracker.print_last_metrics(epoch)
        
        #store train progress
        if save_pickles:
            pickle.dump(t, open( os.path.join(pickles_path,'{0}t_epoch{1}.p'.format(save_prefix,epoch)), "wb" ))
            if model == IBM_model.II:
                pickle.dump(jump_dist, open( os.path.join(pickles_path,'{0}jump_dist_epoch{1}.p'.format(save_prefix,epoch)), "wb" ))
        
        #test for convergence
        if terminate_method == Termination_type.perplexity_convergence:
            if (len(tracker.train_perplexities) > 2) and (abs(tracker.train_perplexities[-2]-train_perplexity) < threshold):
                print('Reached Convergence!')
                break
    
    # Dump metrics to pickles
    test_aer = tracker.calculate_aer_test(epoch+1,
                                   model,
                                   t,
                                   jump_dist if model == IBM_model.II else None,
                                   max_jump if model == IBM_model.II else None)
    print('=================\nTEST AER RESULT: {0}\n================='.format(test_aer))
    tracker.save_metrics(os.path.join(pickles_path,'{}metrics.p'.format(save_prefix)))
    
    if model == IBM_model.I:
        return t
    elif model == IBM_model.II:
        return t, jump_dist

---
# RUNNING THE SCRIPT

### RUNS
---

In [19]:
t = em_algorithm(model=IBM_model.II,initial_method=Initialization_type.random, max_epoch=5, calc_LL_train=True, save_prefix='modelII_init_random3')

Count Words:   0%|          | 422/231164 [00:00<00:54, 4204.49it/s]

creating pairs


Count Words: 100%|██████████| 231164/231164 [00:52<00:00, 4413.41it/s]
Pairs: 100%|██████████| 231164/231164 [00:56<00:00, 4056.10it/s]
AlignII: 100%|██████████| 37/37 [00:00<00:00, 438.85it/s]
Calc LL: 100%|██████████| 231164/231164 [03:11<00:00, 1206.95it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1321.67it/s]
E-step:   0%|          | 46/231164 [00:00<08:52, 433.99it/s]

Results Epoch: Init
AER:
	 validation:	0.8997975708502024
Log Likelihood:
	 train:		-24653513.765052523
	 validation:	-3862.5099861907556
Perplexity:
	 train:		210.94938644632637
	 validation:	212.12096628230967
start epoch: 1


E-step: 100%|██████████| 231164/231164 [25:55<00:00, 148.60it/s]
M-step: 100%|██████████| 11614796/11614796 [00:38<00:00, 301461.13it/s]
AlignII: 100%|██████████| 37/37 [00:00<00:00, 440.96it/s]
Calc LL: 100%|██████████| 231164/231164 [03:31<00:00, 1091.07it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1183.86it/s]


Results Epoch: 1
AER:
	 validation:	0.4464454976303317
Log Likelihood:
	 train:		-32826214.6950358
	 validation:	-5520.723935158106
Perplexity:
	 train:		1243.5095462508118
	 validation:	2115.4807234307614


E-step:   0%|          | 49/231164 [00:00<07:59, 482.19it/s]

start epoch: 2


E-step: 100%|██████████| 231164/231164 [25:19<00:00, 152.12it/s]
M-step: 100%|██████████| 11614796/11614796 [00:26<00:00, 438851.84it/s]
AlignII: 100%|██████████| 37/37 [00:00<00:00, 414.72it/s]
Calc LL: 100%|██████████| 231164/231164 [03:25<00:00, 1127.24it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1240.72it/s]


Results Epoch: 2
AER:
	 validation:	0.29333333333333333
Log Likelihood:
	 train:		-26293920.338745687
	 validation:	-4561.857130214608
Perplexity:
	 train:		301.1795528121136
	 validation:	559.5455896266167


E-step:   0%|          | 46/231164 [00:00<10:38, 361.83it/s]

start epoch: 3


E-step: 100%|██████████| 231164/231164 [25:22<00:00, 151.82it/s]
M-step: 100%|██████████| 11614796/11614796 [00:27<00:00, 418941.17it/s]
AlignII: 100%|██████████| 37/37 [00:00<00:00, 437.91it/s]
Calc LL: 100%|██████████| 231164/231164 [03:20<00:00, 1154.42it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1277.68it/s]


Results Epoch: 3
AER:
	 validation:	0.26095238095238094
Log Likelihood:
	 train:		-24241464.08600809
	 validation:	-4367.103713779296
Perplexity:
	 train:		192.9002602717651
	 validation:	427.09621331104825


E-step:   0%|          | 48/231164 [00:00<08:25, 456.82it/s]

start epoch: 4


E-step: 100%|██████████| 231164/231164 [26:59<00:00, 142.73it/s]
M-step: 100%|██████████| 11614796/11614796 [00:28<00:00, 403699.40it/s]
AlignII: 100%|██████████| 37/37 [00:00<00:00, 481.95it/s]
Calc LL: 100%|██████████| 231164/231164 [03:26<00:00, 1117.87it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 592.61it/s]


Results Epoch: 4
AER:
	 validation:	0.24688995215311005
Log Likelihood:
	 train:		-24245801.826868773
	 validation:	-4483.575399630228
Perplexity:
	 train:		193.08198214240596
	 validation:	501.97543207541526


E-step:   0%|          | 49/231164 [00:00<07:52, 489.25it/s]

start epoch: 5


E-step: 100%|██████████| 231164/231164 [25:31<00:00, 150.93it/s]
M-step: 100%|██████████| 11614796/11614796 [00:24<00:00, 472734.39it/s]
AlignII: 100%|██████████| 37/37 [00:00<00:00, 494.82it/s]
Calc LL: 100%|██████████| 231164/231164 [03:18<00:00, 1161.84it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1371.80it/s]


Results Epoch: 5
AER:
	 validation:	0.2440191387559809
Log Likelihood:
	 train:		-24682974.45047867
	 validation:	-4630.668666550881
Perplexity:
	 train:		212.30275735150735
	 validation:	615.5794573475407


AlignII: 100%|██████████| 447/447 [00:00<00:00, 548.35it/s]


TEST AER RESULT: 0.2149428252084945


#### Diana Runs

In [None]:
t = em_algorithm(model=IBM_model.II,initial_method=Initialization_type.random, max_epoch=5, calc_LL_train=False, save_prefix='modelII_init_random3',
                train_file_f='NLP2/NLP2-Projects/Project1/data/training/hansards.36.2.f',
train_file_e='NLP2/NLP2-Projects/Project1/data/training/hansards.36.2.e',
validation_file_f='NLP2/NLP2-Projects/Project1/data/validation/dev.f',
validation_file_e='NLP2/NLP2-Projects/Project1/data/validation/dev.e',
validation_truth='NLP2/NLP2-Projects/Project1/data/validation/dev.wa.nonullalign',
test_file_f = 'NLP2/NLP2-Projects/Project1/data/testing/test/test.f',
test_file_e = 'NLP2/NLP2-Projects/Project1/data/testing/test/test.e',
test_truth = 'NLP2/NLP2-Projects/Project1/data/testing/answers/test.wa.nonullalign',
pickles_path='NLP2/NLP2-Projects/Project1/data/pickles/',
align_path='NLP2/NLP2-Projects/Project1/data/alignments/')

creating pairs


Count Words: 100%|████████████████████| 231164/231164 [04:33<00:00, 846.04it/s]
Pairs: 100%|██████████████████████████| 231164/231164 [04:43<00:00, 814.17it/s]
AlignII: 100%|█████████████████████████████████| 37/37 [00:00<00:00, 45.48it/s]
Calc LL: 100%|████████████████████████████████| 37/37 [00:00<00:00, 171.19it/s]


Results Epoch: Init
AER:
	 validation:	0.896551724137931
Log Likelihood:
	 train:		-1
	 validation:	-3866.965145717926
Perplexity:
	 train:		-1
	 validation:	213.43574923865765
start epoch: 1


E-step:  93%|██████████████████████▎ | 215201/231164 [1:51:38<08:16, 32.13it/s]

# Trained  models evaluation

In [37]:
file_path = 'data/pickles/modelII_init_ibm1/modelII_report_ibm1_t_epoch'
file_path_jump = 'data/pickles/modelII_init_ibm1/modelII_report_ibm1_jump_dist_epoch'
save_prefix='modelII_init_ibm1_eval'
max_epochs = 5
max_jump = 100
model = IBM_model.II
use_VB=False
alpha=0.1
initial_method=Initialization_type.modelI
file_enc='utf-8'

In [39]:
validation_file_f='data/validation/dev.f' 
validation_file_e='data/validation/dev.e'
train_file_f='data/training/hansards.36.2.f'
train_file_e='data/training/hansards.36.2.e'
test_file_f = 'data/testing/test/test.f'
test_file_e = 'data/testing/test/test.e'

with open(train_file_f, encoding=file_enc) as f:
    train_file_f = f.readlines()
with open(train_file_e, encoding=file_enc) as f:
    train_file_e = f.readlines()
with open(validation_file_f, encoding=file_enc) as f:
    validation_file_f = f.readlines()
with open(validation_file_e, encoding=file_enc) as f:
    validation_file_e = f.readlines()
with open(test_file_f, encoding=file_enc) as f:
    test_file_f = f.readlines()
with open(test_file_e, encoding=file_enc) as f:
    test_file_e = f.readlines()

In [40]:
pairs, vocab_f, vocab_e = create_pairs_and_update_files(train_file_f, train_file_e)
update_files(vocab_f, vocab_e, validation_file_f, validation_file_e)
update_files(vocab_f, vocab_e, test_file_f, test_file_e) 

Count Words:   0%|          | 982/231164 [00:00<00:46, 4903.22it/s]

creating pairs


Count Words: 100%|██████████| 231164/231164 [00:57<00:00, 4003.34it/s]
Pairs: 100%|██████████| 231164/231164 [01:02<00:00, 3680.62it/s]


In [41]:
ttemp = pickle.load(open( "data/pickles/modelI/modelI_report_t_epoch10.p", "rb" ) )
ttemp, jump_dist = init_params_modelII(initial_method, pairs, max_jump, t=ttemp)
# ttemp = init_params_modelI(initial_method, pairs)

tracker = Metrics_tracker(save_prefix=save_prefix,
                          align_path='data/alignments/', 
                          validation_truth='data/validation/dev.wa.nonullalign', 
                          validation_file_f=validation_file_f, 
                          validation_file_e=validation_file_e,
                          train_file_f=train_file_f,
                          train_file_e=train_file_e,
                          test_truth = 'data/testing/answers/test.wa.nonullalign',
                          test_file_f=test_file_f,
                          test_file_e=test_file_e,
                          vocab_f=vocab_f,
                          vocab_e=vocab_e,
                          calc_LL_train=True,
                          file_enc='utf-8')
tracker.track_metrics(0, model, ttemp, jump_dist if model == IBM_model.II else None, 
                                       max_jump if model == IBM_model.II else None,
                                       use_VB, alpha)
tracker.print_last_metrics('init')
for epoch in range(1,max_epochs+1):
    path = file_path+str(epoch)+str('.p')
    if model == IBM_model.II:
        path_jump = file_path_jump+str(epoch)+str('.p')
        jump_dist = pickle.load(open( path_jump, "rb" ) )
    ttemp = pickle.load(open( path, "rb" ) )
    tracker.track_metrics(0, model, ttemp, jump_dist if model == IBM_model.II else None, 
                                           max_jump if model == IBM_model.II else None,
                                           use_VB, alpha)
    tracker.print_last_metrics(epoch)
    

AlignII: 100%|██████████| 37/37 [00:00<00:00, 457.05it/s]
Calc LL: 100%|██████████| 231164/231164 [03:59<00:00, 963.52it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1092.11it/s]


Results Epoch: init
AER:
	 validation:	0.3695652173913043
Log Likelihood:
	 train:		-31301820.420033842
	 validation:	-5165.251025835169
Perplexity:
	 train:		893.1796428254802
	 validation:	1292.081476383158


AlignII: 100%|██████████| 37/37 [00:00<00:00, 400.64it/s]
Calc LL: 100%|██████████| 231164/231164 [03:56<00:00, 976.76it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1098.84it/s]


Results Epoch: 1
AER:
	 validation:	0.246390760346487
Log Likelihood:
	 train:		-24454024.121539105
	 validation:	-4272.308264231719
Perplexity:
	 train:		202.00942723800594
	 validation:	374.4774446115372


AlignII: 100%|██████████| 37/37 [00:00<00:00, 396.88it/s]
Calc LL: 100%|██████████| 231164/231164 [03:53<00:00, 991.50it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 982.74it/s]


Results Epoch: 2
AER:
	 validation:	0.24472168905950098
Log Likelihood:
	 train:		-24192558.08525369
	 validation:	-4246.162775960261
Perplexity:
	 train:		190.86322745128425
	 validation:	361.14110697258474


AlignII: 100%|██████████| 37/37 [00:00<00:00, 377.27it/s]
Calc LL: 100%|██████████| 231164/231164 [03:53<00:00, 990.01it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1155.09it/s]


Results Epoch: 3
AER:
	 validation:	0.24161073825503354
Log Likelihood:
	 train:		-24665514.032717485
	 validation:	-4380.244201896991
Perplexity:
	 train:		211.49961265806988
	 validation:	434.95156353554165


AlignII: 100%|██████████| 37/37 [00:00<00:00, 360.08it/s]
Calc LL: 100%|██████████| 231164/231164 [04:12<00:00, 914.05it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1166.23it/s]


Results Epoch: 4
AER:
	 validation:	0.239193083573487
Log Likelihood:
	 train:		-24958596.325552233
	 validation:	-4544.4270625231275
Perplexity:
	 train:		225.39253708222182
	 validation:	546.180854401027


AlignII: 100%|██████████| 37/37 [00:00<00:00, 380.12it/s]
Calc LL: 100%|██████████| 231164/231164 [04:11<00:00, 919.76it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1102.59it/s]

Results Epoch: 5
AER:
	 validation:	0.23320537428023036
Log Likelihood:
	 train:		-24986903.331971664
	 validation:	-4700.752832791115
Perplexity:
	 train:		226.78176902415203
	 validation:	678.421072017162





In [42]:
test_aer = tracker.calculate_aer_test(epoch+1,
                                      model,
                                      ttemp,
                                      jump_dist if model == IBM_model.II else None,
                                      max_jump if model == IBM_model.II else None)
tracker.save_metrics('data/pickles/modelII_init_ibm1_metrics.p')

AlignII: 100%|██████████| 447/447 [00:00<00:00, 510.85it/s]


In [80]:
jump_dist = pickle.load(open( "data/pickles/FullModel2_t_init_uniform/modelII_init_uniform_with_LOW_jump_dist_epoch5.p", "rb" ) )
ttemp = pickle.load(open( "data/pickles/FullModel2_t_init_uniform/modelII_init_uniform_with_LOW_t_epoch5.p", "rb" ) )
test_aer = tracker.calculate_aer_test('test',
                                      IBM_model.II,
                                      ttemp,
                                      jump_dist,
                                      100)

AlignII: 100%|██████████| 447/447 [00:00<00:00, 540.29it/s]


In [43]:
test_aer

0.21582485778314087