# Project 1

### Initialise
---

#### import needed module

In [19]:
import numpy as np
from collections import Counter
from collections import defaultdict
from ipywidgets import *
from tqdm import tqdm_notebook, tqdm
from aer import *
import pickle
from copy import deepcopy
import os
from enum import Enum
from scipy.special import digamma
from random import random

import mmap

#### create supporting functions
---

In [20]:
def create_pairs_and_update_files(file_f,file_e,null='<NULL>'):
    """
    given a french and an english file, it will loop over the pairs 
    of lines. For every occuring combination in a line, a pair is 
    added to the dict. 
    """
    print("creating pairs")
    counter_e = Counter()
    counter_f = Counter()
    for line_num, (line_f, line_e) in tqdm(enumerate(zip(file_f,file_e)),total=len(file_f), desc='Count Words', leave=True):
        words_f = line_f.split()
        words_e = line_e.split()
        file_f[line_num] = words_f
        file_e[line_num] = words_e
        
        for word_f in words_f:
            counter_f[word_f] += 1
            for word_e in words_e:
                counter_e[word_e] += 1
                    
    fe_pairs = dict()
    vocab_e = set()
    vocab_f = set()
    fe_pairs[('<LOW>','<LOW>')] = 1
    for line_num, (line_f, line_e) in tqdm(enumerate(zip(file_f,file_e)),total=len(file_f), desc='Pairs', leave=True):
        c = 0
        for i,f in enumerate(line_f):
            if counter_f[f] == 1:
                file_f[line_num][i] = '<LOW>'
                f = '<LOW>'
            fe_pairs[(f, null)] = 1
            if f not in vocab_f:
                vocab_f.update([f])
            for j,e in enumerate(line_e):
                if not c and e != '<LOW>' and e != null and counter_e[e] == 1:
                    file_e[line_num][j] = '<LOW>'
                    e = '<LOW>'
                fe_pairs[(f,e)] = 1
                if not c and e not in vocab_e:
                    vocab_e.update([e])
            c += 1

    return fe_pairs, vocab_f, vocab_e

def update_files(vocab_f,vocab_e,file_f,file_e,null='<NULL>'):
    """
    given a french and an english file and the vocabularies, 
    it will update the files with non occuring 
    """
    for line_num, (line_f, line_e) in enumerate(zip(file_f,file_e)):
        words_f = line_f.split()
        words_e = line_e.split()
        for i,f in enumerate(words_f):
            if f not in vocab_f:
                words_f[i] = '<LOW>'
        for i,e in enumerate(words_e):
            if e not in vocab_e:
                words_e[i] = '<LOW>'
        file_f[line_num] = words_f
        file_e[line_num] = words_e

Different enum classes, that can are used to set certain hyperparameters for training the model. 

In [21]:
class IBM_model(Enum):
    I = 1
    II = 2
    
class Initialization_type(Enum):
    uniform = 1
    random = 2
    modelI = 3
    
class Termination_type(Enum):
    epochs = 1
    perplexity_convergence = 2

### Metrics calculation
---

In [22]:
def calculate_perplexities(model,t,jump_dist,max_jump,file_f_train,file_e_train,file_f_val,file_e_val,calc_LL_train,null='<NULL>'):
    """
    Given the model, it determines which perplexity calculation should be done. 
    It calculates the perplexity for both the training and the validation data. 
    """
    train_log_likelihood = -1 
    train_perplexity = -1
    if calc_LL_train:
        train_log_likelihood, train_perplexity = calculate_perplexity(model,
                                                                     t,
                                                                     jump_dist,
                                                                     max_jump,
                                                                     file_f_train,
                                                                     file_e_train,
                                                                     null=null)        
    val_log_likelihood, val_perplexity = calculate_perplexity(model,
                                                             t,
                                                             jump_dist,
                                                             max_jump,
                                                             file_f_val,
                                                             file_e_val,
                                                             null=null)
    
    return train_log_likelihood, val_log_likelihood, train_perplexity, val_perplexity

In [23]:
def calculate_perplexity(model,t,jump_dist, max_jump,file_f,file_e,null='<NULL>'):
    if model == IBM_model.I:
        log_likelihood,N = calculate_log_likelihood_modelI(t, file_f, file_e, null)
    else:
        log_likelihood,N = calculate_log_likelihood_modelII(t, file_f, file_e, jump_dist, max_jump, null)
    
    perplexity = np.exp(-1*log_likelihood/N)
    return log_likelihood,perplexity

In [24]:
class Metrics_tracker:
    def __init__(self,
                 save_prefix, 
                 align_path, 
                 validation_truth, 
                 validation_file_f, 
                 validation_file_e,
                 train_file_f,
                 train_file_e,
                 test_truth,
                 test_file_f,
                 test_file_e,
                 vocab_f,
                 vocab_e,
                 calc_LL_train,
                 file_enc):
                
        self.save_prefix = save_prefix
        self.align_path = align_path
        self.validation_truth = validation_truth
        self.validation_file_f = validation_file_f
        self.validation_file_e = validation_file_e
        self.train_file_f = train_file_f
        self.train_file_e = train_file_e
        self.test_truth = test_truth
        self.test_file_f = test_file_f
        self.test_file_e = test_file_e
        self.vocab_f = vocab_f
        self.vocab_e = vocab_e
        self.calc_LL_train = calc_LL_train
        self.file_enc = file_enc
        
        # Track
        self.val_aers = []
        self.train_log_likelihoods = []
        self.val_log_likelihoods = []
        self.train_perplexities = []
        self.val_perplexities = []
        self.test_aer = None
    
    def track_metrics(self, epoch, model, t, jump_dist=None, max_jump=None):
        aer = self.calculate_aer_validation(epoch, model, t, jump_dist, max_jump)
        train_ll, val_ll, train_pp, val_pp = self.calculate_perplexities(model, t, jump_dist, max_jump)
        
        # Store
        self.val_aers.append(aer)
        self.train_log_likelihoods.append(train_ll)
        self.val_log_likelihoods.append(val_ll)
        self.train_perplexities.append(train_pp)
        self.val_perplexities.append(val_pp)
        
    def print_last_metrics(self, epoch = None, aer=True,train_ll=True,val_ll=True,train_pp=True,val_pp=True):
        if epoch == None:
            epoch = len(val_aers)
        print('Results Epoch: '+str(epoch))
        print('====================')
        if aer and self.val_aers:
            print('AER:\n\t validation:\t{0}'.format(self.val_aers[-1]))
        if train_ll or val_ll:
            print('Log Likelihood:')
            if train_ll and self.train_log_likelihoods:
                print('\t train:\t\t{0}'.format(self.train_log_likelihoods[-1]))
            if val_ll and self.val_log_likelihoods:
                print('\t validation:\t{0}'.format(self.val_log_likelihoods[-1]))
        if train_pp or val_pp:
            print('Perplexity:')
            if train_pp and self.train_perplexities:
                print('\t train:\t\t{0}'.format(self.train_perplexities[-1]))
            if val_ll and self.val_perplexities:
                print('\t validation:\t{0}'.format(self.val_perplexities[-1]))
    
    def save_metrics(self, file_name = 'metrics.p'):
        metrics = {'train_log_likelihoods': self.train_log_likelihoods,
                   'val_log_likelihoods': self.val_log_likelihoods,
                   'train_perplexities': self.train_perplexities,
                   'val_perplexities': self.val_perplexities,
                   'val_aers': self.val_aers,
                   'test_aer': self.test_aer}
        pickle.dump(metrics, open(file_name, "wb"))
    
    def calculate_aer_validation(self, epoch, model, t, jump_dist=None, max_jump=None):
        return self.calculate_aer(epoch, model, t, self.validation_file_f, self.validation_file_e, self.validation_truth, jump_dist, max_jump)
        
    def calculate_aer_test(self, epoch, model, t, jump_dist=None, max_jump=None):
        self.test_aer = self.calculate_aer(epoch, model, t, self.test_file_f, self.test_file_e, self.test_truth, jump_dist, max_jump)
        return self.test_aer
    
    def calculate_aer(self, epoch, model, t, file_f, file_e, file_truth, jump_dist, max_jump):
        align_file = os.path.join(self.align_path,'{0}validation_epoch{1}.align'.format(self.save_prefix, epoch))
        if model == IBM_model.I:
            create_alignments_modelI(t, file_f, file_e, align_file, self.file_enc)
        elif model == IBM_model.II:
            create_alignments_modelII(t, jump_dist, max_jump, file_f, file_e, align_file, self.file_enc)

        aer = test(file_truth, align_file)
        return aer
    
    def calculate_perplexities(self, model, t, jump_dist, max_jump):
        train_ll, val_ll, train_pp, val_pp = calculate_perplexities(model,
                                                                    t,
                                                                    jump_dist,
                                                                    max_jump,
                                                                    self.train_file_f,
                                                                    self.train_file_e,
                                                                    self.validation_file_f,
                                                                    self.validation_file_e,
                                                                    self.calc_LL_train)

        return train_ll, val_ll, train_pp, val_pp

# IBM I
---

In [25]:
def init_params_modelI(initial_method, pairs, null='<NULL>'):
    # Returns: t[(f,e)] - translation probabilities
    
    assert initial_method == Initialization_type.uniform, 'Unsupported initalization method {} for IBM model I'.format(initial_method)
    
    e_vocab_size = sum(1 for k,v in tqdm(pairs,  desc='Init Norm', leave=True) if v != null)
    t = dict(zip(pairs,[1.0/e_vocab_size]*len(pairs)))    
    return t

In [26]:
# Train
'''
E-step:
    for each word j in french sentence:
        the probability of fj|ei divided by (for t=0>m: fj|et)
        
M-step:
    E[fe]/E[e]
'''
def em_step_modelI(t, file_f, file_e, use_VB, alpha):
    num_lines = len(file_f)
    
    # Set to zero
    cooccurrences = defaultdict(float) # count words e and f happen together
    total_f = defaultdict(float) # count word f happens
    counter_f = Counter()
    total_e = defaultdict(float) # count word e happens
    
    for f_sentence, e_sentence in tqdm(zip(file_f,file_e),total=num_lines,  desc='E-step', leave=True):
        for e in e_sentence:
            total_e[e] = 0
            for f in f_sentence:
                total_e[e] += t[(f,e)]
                counter_f[f] += 1

        for e in e_sentence:
            for f in f_sentence:
                temp = t[(f,e)] / total_e[e]
                cooccurrences[(f,e)] += temp
                total_f[f] += temp

    for f,e in tqdm(cooccurrences.keys(),  desc='M-Step', leave=True):
        if use_VB:
            #theta_f|e =  exp( digamma(lambda_f|e) - digamma(sum_f' lambda(f'|e))) where lambda_f|e = E(#f-e)+alpha_f   
            t[(f,e)] = np.exp( digamma(cooccurrences[(f,e)] + alpha) - digamma(total_f[f] + counter_f[f]*alpha))
        else:
            t[(f,e)] = cooccurrences[(f,e)] / total_f[f]
        
    return t

In [27]:
def create_alignments_modelI(t, file_f, file_e, target, file_enc='utf-8'):
    # open file to write to
    with open(target,'w',encoding=file_enc) as tar:
        # for each sentence in list
        for line_num, (f_sentence,e_sentence) in tqdm(enumerate(zip(file_f,file_e)),total=len(file_f),  desc='AlignI', leave=True):
            # for each word in sentence, find the best alignment
            for ind_f,f in enumerate(f_sentence):
                ind_f += 1 #0 is reserved for null
                max_ind_e = 0 #when no alignment is found, align to zero
                max_p = 0
                for ind_e,e in enumerate(e_sentence):
                    ind_e += 1 #0 is reserved for null
                    if (f,e) in t:
                        if t[(f,e)] > max_p:
                            max_p = t[(f,e)]
                            max_ind_e = ind_e

                if max_ind_e != 0: # Skip null alignments
                    # write to file. Output: sentence_line english_pos french_pos probability
                    tar.write('%d %d %d P %f\n'%(line_num, max_ind_e, ind_f, max_p)) 

In [28]:
def calculate_log_likelihood_modelI(t, file_f, file_e, null='<NULL>'):
    log_likelihood = 0
    N = 0
    for sentence_f, sentence_e in tqdm(zip(file_f,file_e),total=len(file_f), desc='Calc LL', leave=True):
        l = len(sentence_e)
        m = len(sentence_f)
        sentence_e = [null] + sentence_e
        
        #                np.log(-m * np.log(l + 1))
        
        alignment_prob = -np.log(m*np.log(l+1)) #+ np.log(-m)

        for f in sentence_f:
            max_p = 0
            for e in sentence_e:
                if (f,e) in t and t[(f,e)] > max_p:
                    max_p = t[(f,e)]        
            N += 1
            log_likelihood += alignment_prob + np.log(max_p)
    return log_likelihood, N

# IBM II
---

In [29]:
def init_params_modelII(initial_method, pairs, max_jump, t=None, null='<NULL>'):
    # Returns: t[(f,e)] and jump_dist
    
    if t == None:
        if initial_method == Initialization_type.uniform:
            t = init_params_modelI(initial_method, pairs)
        elif initial_method == Initialization_type.random:
            t = dict(zip(pairs,[random() for x in range(len(pairs))]))
        elif initial_method == Initialization_type.modelI:
            # Initialize t from model I output 10 iterations
            t = em_algorithm(model=IBM_model.I,max_epoch=10,initial_method=Initialization_type.uniform,save_pickles=False)

    # Initialize jump distribution
    jump_dist = 1. / (2 * max_jump) * np.ones([1, 2 * max_jump])
    
    return t, jump_dist

In [30]:
# Train

def em_step_modelII(t, jump_dist, max_jump, file_f, file_e, null='<NULL>'):
    # Set to zero
    counts_e_f = defaultdict(float) # counts words e and f happen together
    counts_e = defaultdict(float) # counts word e happens
    counts_jump = [0] * max_jump*2 # counts per jump between words
    
    num_lines = len(file_f)
    
    for f_sentence, e_sentence in tqdm(zip(file_f,file_e),total=num_lines,  desc='E-step', leave=True):
        # Get lengths
        l = len(e_sentence)
        m = len(f_sentence)
        f_sentence = [None] + f_sentence
        e_sentence = [null] + e_sentence
      
        for i in range(1, m+1): # french
            f = f_sentence[i]
            den = sum(jump_dist[0, jump_func(x,i,l,m,max_jump)]*t[(f,e_sentence[x])] for x in range(0, l+1))
            assert den != 0, 'normalization denominator is zero. i: {}, l:{}, m:{}'.format(i,l,m)
            
            for j in range(0, l+1): # english
                e = e_sentence[j]
                
                jump_idx = jump_func(j, i, l, m, max_jump)
                delta = t[(f,e)] * jump_dist[0, jump_idx] / den

                counts_e_f[(e,f)] += delta
                counts_e[e] += delta
                counts_jump[jump_idx] += delta

    for e,f in tqdm(counts_e_f.keys(),  desc='M-step', leave=True):
        assert counts_e[e] != 0, 'counts_e[{}] is zero'.format(e)
        t[(f,e)] = counts_e_f[(e,f)] / counts_e[e]

    jump_den = sum(counts_jump)
    assert jump_den != 0, 'normalization denominator for jumps is zero'
    for i,c in enumerate(counts_jump):
        jump_dist[0,i] = c / jump_den        

    return t, jump_dist

In [31]:
# Jump function. From https://uva-slpl.github.io/nlp2/projects/2018/04/12/project1.html

def jump_func(i, j, m, n, max_jump):
    """
    Alignment of french word j to english word i. 
    i = 0, to ,m  (we use m as in Wilker's lecture slides -- length of English sentence)
    j = 1, to ,n  (we use n as in Wilker's lecture slides -- length of French sentence)
    That is: a_j = i
    with e.g. max_jump = 100
    from[-max_jump, max_jump] to [0, 2*max_jump + 1] 
    """
    # We normalise j by the lenght of the French sentence and scale the result to the length of the English sentence
    # this gives us a continuous value that is an interpolation of where we j would be in the English sentence
    # if alignments were a linear function of the length ratio
    jump = np.floor(i - (j * m / n)) 
    # then we collapse all jumps that are too far to the right to the maximum jump value allowed
    if jump > max_jump:  # or we collapse all jumps that are too far to the left to the maximum (negative) jump allowed
        jump = max_jump   
    elif jump < -max_jump:
        jump = -max_jump
    # Now we shift the jump values so they start from 0
    #  this is only necessary if you use python lists or numpy vectors to store jump probabilities
    #  otherwise, you can use a python dict and this shifting is not required since dicts can have negative keys
    idx = (int)(jump + max_jump)
    if idx >= 2*max_jump: # Fix for out of bounds index
        idx -= 1
    return idx

In [32]:
def create_alignments_modelII(t, jump_dist, max_jump, file_f, file_e, target, file_enc='utf-8', null='<NULL>'):
    # open file to write to
    with open(target,'w',encoding=file_enc) as tar:
        # for each sentence in list
        for line_num, (f_sentence,e_sentence) in tqdm(enumerate(zip(file_f,file_e)), total=len(file_f), desc='AlignII', leave=True):
            # Get lengths
            l = len(e_sentence)
            m = len(f_sentence)
            f_sentence = [None] + f_sentence
            e_sentence = [null] + e_sentence

            # for each word position in sentence, find the best alignment
            for i in range(1, m+1): # french
                max_p = 0
                max_ind = 0 #when no alignment is found, align to zero
                f = f_sentence[i]
                for j in range(0, l+1): # english
                    e = e_sentence[j]

                    if (f,e) in t:
                        p = t[(f,e)]*jump_dist[0, jump_func(j,i,l,m,max_jump)]

                        if p >= max_p:
                            max_p = p
                            max_ind = j

                if max_ind != 0: # Skip null alignments
                    # write to file. Output: sentence_line english_pos french_pos probability
                    tar.write('%d %d %d P %f\n'%(line_num, max_ind, i, max_p)) 

In [33]:
def calculate_log_likelihood_modelII(t, file_f, file_e, jump_dist, max_jump, null='<NULL>'):
    log_likelihood = 0
    N = 0
    for sentence_f, sentence_e in tqdm(zip(file_f,file_e),total=len(file_f),  desc='Calc LL', leave=True):
        l = len(sentence_e)
        m = len(sentence_f)

        sentence_e = [null] + sentence_e
        sentence_f = [None] + sentence_f
        
        for i in range(1, m+1): # french
            f = sentence_f[i]
            max_p = 0
            max_align_p = 0
            for j in range(0, l+1): # english
                e = sentence_e[j]
                if (f,e) in t and t[(f,e)] >= max_p:
                    max_p = t[(f,e)]
                    max_align_p = jump_dist[0, jump_func(j,i,l,m,max_jump)]                
            N += 1
            log_likelihood += np.log(max_align_p) + np.log(max_p)
    return log_likelihood, N

# Shared

In [34]:
def em_algorithm(model,
                 t=None, #Only used for model II
                 max_epoch=10, 
                 threshold=0.01,
                 initial_method=Initialization_type.uniform, #How to initialize t
                 terminate_method=Termination_type.epochs, 
                 train_file_f='data/training/hansards.36.2.f',
                 train_file_e='data/training/hansards.36.2.e',
                 validation_file_f='data/validation/dev.f',
                 validation_file_e='data/validation/dev.e',
                 validation_truth='data/validation/dev.wa.nonullalign',
                 test_file_f = 'data/testing/test/test.f',
                 test_file_e = 'data/testing/test/test.e',
                 test_truth = 'data/testing/answers/test.wa.nonullalign',
                 pickles_path='data/pickles/',
                 align_path='data/alignments/',
                 save_prefix='',
                 save_pickles=True,
                 use_VB=False,
                 calc_LL_train=True,
                 alpha=0.1, #Only used if VB is used
                 file_enc='utf-8'):
    
    # test if prefix exists and correct format
    if save_prefix != '' and save_prefix[-1]!='_':
        save_prefix+='_'
    
    # read in all the files
    with open(train_file_f, encoding=file_enc) as f:
        train_file_f = f.readlines()
    with open(train_file_e, encoding=file_enc) as f:
        train_file_e = f.readlines()
    with open(validation_file_f, encoding=file_enc) as f:
        validation_file_f = f.readlines()
    with open(validation_file_e, encoding=file_enc) as f:
        validation_file_e = f.readlines()
    with open(test_file_f, encoding=file_enc) as f:
        test_file_f = f.readlines()
    with open(test_file_e, encoding=file_enc) as f:
        test_file_e = f.readlines()
    
    # get word pairs from corpus
    pairs, vocab_f, vocab_e = create_pairs_and_update_files(train_file_f, train_file_e)
    update_files(vocab_f, vocab_e, validation_file_f, validation_file_e)
    update_files(vocab_f, vocab_e, test_file_f, test_file_e)

    #initialize parameters
    if model == IBM_model.I:
        t = init_params_modelI(initial_method, pairs)
    elif model == IBM_model.II:
        # For jump function
        max_jump = 100        
        t, jump_dist = init_params_modelII(initial_method, pairs, max_jump, t)
    
    tracker = Metrics_tracker(save_prefix, 
                              align_path, 
                              validation_truth, 
                              validation_file_f, 
                              validation_file_e,
                              train_file_f,
                              train_file_e,
                              test_truth,
                              test_file_f,
                              test_file_e,
                              vocab_f,
                              vocab_e,
                              calc_LL_train,
                              file_enc)
    
    # calculate initial scores before training
    tracker.track_metrics(0, model, t, jump_dist if model == IBM_model.II else None, 
                                       max_jump if model == IBM_model.II else None)
    
    
    #print train result
    tracker.print_last_metrics('Init')
        
    # loop for max_epochs or till convergence is reached
    for epoch in range(1,max_epoch+1):
        print("start epoch: "+str(epoch))
        
        # do an EM step
        if model == IBM_model.I:
            t = em_step_modelI(t, train_file_f, train_file_e, use_VB, alpha)
        else:
            t, jump_dist = em_step_modelII(t, jump_dist, max_jump, train_file_f, train_file_e)
        
        # create AER results and calculate the loglikelihoods/perplexity
        tracker.track_metrics(epoch, model, t, jump_dist if model == IBM_model.II else None, 
                                               max_jump if model == IBM_model.II else None)
        tracker.print_last_metrics(epoch)
        
        #store train progress
        if save_pickles:
            pickle.dump(t, open( os.path.join(pickles_path,'{0}t_epoch{1}.p'.format(save_prefix,epoch)), "wb" ))
            if model == IBM_model.II:
                pickle.dump(jump_dist, open( os.path.join(pickles_path,'{0}jump_dist_epoch{1}.p'.format(save_prefix,epoch)), "wb" ))
        
        #test for convergence
        if terminate_method == Termination_type.perplexity_convergence:
            if (len(tracker.train_perplexities) > 2) and (abs(tracker.train_perplexities[-2]-train_perplexity) < threshold):
                print('Reached Convergence!')
                break
    
    # Dump metrics to pickles
    test_aer = tracker.calculate_aer_test(epoch+1,
                                   model,
                                   t,
                                   jump_dist if model == IBM_model.II else None,
                                   max_jump if model == IBM_model.II else None)
    print('=================\nTEST AER RESULT: {0}\n================='.format(test_aer))
    tracker.save_metrics(os.path.join(pickles_path,'{}metrics.p'.format(save_prefix)))
    
    if model == IBM_model.I:
        return t
    elif model == IBM_model.II:
        return t, jump_dist

---
# RUNNING THE SCRIPT

### RUNS
---

In [47]:
# Run model I
t = em_algorithm(model=IBM_model.I, max_epoch=10, save_prefix='modelI_report')
#                  ,
#                  train_file_f='data/validation/dev.f',
#                  train_file_e='data/validation/dev.e')

Count Words:   0%|          | 195/231164 [00:00<01:58, 1949.17it/s]

creating pairs


Count Words: 100%|██████████| 231164/231164 [01:56<00:00, 1976.63it/s]
Pairs: 100%|██████████| 231164/231164 [01:36<00:00, 2386.56it/s]
Init Norm: 100%|██████████| 11614796/11614796 [00:14<00:00, 810577.80it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1739.05it/s]
Calc LL: 100%|██████████| 231164/231164 [02:56<00:00, 1307.98it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1675.46it/s]
E-step:   0%|          | 141/231164 [00:00<05:29, 700.52it/s]

Results Epoch: Init
AER:
	 validation:	0.9065155807365439
Log Likelihood:
	 train:		-2684.922037257008
	 validation:	-1613.9292653144628
Perplexity:
	 train:		1.0005829946141471
	 validation:	9.378871556029178
start epoch: 1


E-step: 100%|██████████| 231164/231164 [07:30<00:00, 513.39it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:32<00:00, 354430.33it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1683.11it/s]
Calc LL: 100%|██████████| 231164/231164 [02:43<00:00, 1411.99it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1610.82it/s]


Results Epoch: 1
AER:
	 validation:	0.8016997167138811
Log Likelihood:
	 train:		-2415.721303235616
	 validation:	-1352.3986734140342
Perplexity:
	 train:		1.0005245259776225
	 validation:	6.525557012167654


E-step:   0%|          | 86/231164 [00:00<04:31, 851.74it/s]

start epoch: 2


E-step: 100%|██████████| 231164/231164 [07:50<00:00, 491.22it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:29<00:00, 396321.61it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1731.69it/s]
Calc LL: 100%|██████████| 231164/231164 [20:05<00:00, 191.77it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1675.46it/s]


Results Epoch: 2
AER:
	 validation:	0.49008498583569404
Log Likelihood:
	 train:		-2401.468665188581
	 validation:	-1339.9927730119987
Perplexity:
	 train:		1.0005214304932093
	 validation:	6.414235365164364


E-step:   0%|          | 133/231164 [00:00<05:53, 654.00it/s]

start epoch: 3


E-step: 100%|██████████| 231164/231164 [07:37<00:00, 504.86it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:31<00:00, 368805.96it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1600.53it/s]
Calc LL: 100%|██████████| 231164/231164 [02:40<00:00, 1443.76it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1826.16it/s]


Results Epoch: 3
AER:
	 validation:	0.4230406043437205
Log Likelihood:
	 train:		-2395.263123189799
	 validation:	-1334.9009834654034
Perplexity:
	 train:		1.0005200827345826
	 validation:	6.36909683068159


E-step:   0%|          | 81/231164 [00:00<04:48, 801.57it/s]

start epoch: 4


E-step: 100%|██████████| 231164/231164 [07:26<00:00, 518.16it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:28<00:00, 413393.11it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1729.63it/s]
Calc LL: 100%|██████████| 231164/231164 [02:36<00:00, 1477.73it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1746.96it/s]


Results Epoch: 4
AER:
	 validation:	0.39565627950897075
Log Likelihood:
	 train:		-2392.868572885601
	 validation:	-1333.2165482318226
Perplexity:
	 train:		1.000519562671578
	 validation:	6.354234405110042


E-step:   0%|          | 86/231164 [00:00<04:29, 858.29it/s]

start epoch: 5


E-step: 100%|██████████| 231164/231164 [07:28<00:00, 514.85it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:29<00:00, 391693.81it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1627.95it/s]
Calc LL: 100%|██████████| 231164/231164 [02:36<00:00, 1475.12it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1743.56it/s]


Results Epoch: 5
AER:
	 validation:	0.38243626062322944
Log Likelihood:
	 train:		-2391.7396050427124
	 validation:	-1332.6361177644642
Perplexity:
	 train:		1.0005193174755664
	 validation:	6.349121080447569


E-step:   0%|          | 85/231164 [00:00<04:33, 843.69it/s]

start epoch: 6


E-step: 100%|██████████| 231164/231164 [07:26<00:00, 517.77it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:35<00:00, 325941.80it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1651.78it/s]
Calc LL: 100%|██████████| 231164/231164 [02:36<00:00, 1477.36it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1748.69it/s]


Results Epoch: 6
AER:
	 validation:	0.3729933899905571
Log Likelihood:
	 train:		-2391.112790368089
	 validation:	-1332.4648036622189
Perplexity:
	 train:		1.0005191813402197
	 validation:	6.347612668843216


E-step:   0%|          | 87/231164 [00:00<04:29, 858.48it/s]

start epoch: 7


E-step: 100%|██████████| 231164/231164 [07:26<00:00, 517.33it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:28<00:00, 409809.38it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1721.84it/s]
Calc LL: 100%|██████████| 231164/231164 [02:36<00:00, 1473.90it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1668.72it/s]


Results Epoch: 7
AER:
	 validation:	0.3682719546742209
Log Likelihood:
	 train:		-2390.795344446599
	 validation:	-1332.551271484064
Perplexity:
	 train:		1.0005191123954296
	 validation:	6.348373968641693


E-step:   0%|          | 83/231164 [00:00<04:40, 822.94it/s]

start epoch: 8


E-step: 100%|██████████| 231164/231164 [07:31<00:00, 512.38it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:30<00:00, 374216.98it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1494.00it/s]
Calc LL: 100%|██████████| 231164/231164 [02:36<00:00, 1475.45it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1046.92it/s]


Results Epoch: 8
AER:
	 validation:	0.36449480642115206
Log Likelihood:
	 train:		-2390.6379785837767
	 validation:	-1332.7894490089827
Perplexity:
	 train:		1.0005190782177789
	 validation:	6.350471457924301


E-step:   0%|          | 83/231164 [00:00<04:41, 819.91it/s]

start epoch: 9


E-step: 100%|██████████| 231164/231164 [07:34<00:00, 508.79it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:29<00:00, 390204.71it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1418.61it/s]
Calc LL: 100%|██████████| 231164/231164 [02:43<00:00, 1414.82it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1581.48it/s]


Results Epoch: 9
AER:
	 validation:	0.3616619452313503
Log Likelihood:
	 train:		-2390.5883109717
	 validation:	-1333.0700752772539
Perplexity:
	 train:		1.0005190674306728
	 validation:	6.35294365760056


E-step:   0%|          | 88/231164 [00:00<04:30, 854.13it/s]

start epoch: 10


E-step: 100%|██████████| 231164/231164 [07:35<00:00, 507.32it/s]
M-Step: 100%|██████████| 11583295/11583295 [00:32<00:00, 360098.15it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 1731.21it/s]
Calc LL: 100%|██████████| 231164/231164 [02:34<00:00, 1499.75it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 1781.55it/s]


Results Epoch: 10
AER:
	 validation:	0.3616619452313503
Log Likelihood:
	 train:		-2390.6066612801187
	 validation:	-1333.335899558221
Perplexity:
	 train:		1.0005190714161012
	 validation:	6.355286345578848


AlignI: 100%|██████████| 447/447 [00:00<00:00, 2464.37it/s]


TEST AER RESULT: 0.352063734214764


#### Diana Runs

In [None]:
t = em_algorithm(model=IBM_model.II,initial_method=Initialization_type.random, max_epoch=5, calc_LL_train=False, save_prefix='modelII_init_random3',
                train_file_f='NLP2/NLP2-Projects/Project1/data/training/hansards.36.2.f',
train_file_e='NLP2/NLP2-Projects/Project1/data/training/hansards.36.2.e',
validation_file_f='NLP2/NLP2-Projects/Project1/data/validation/dev.f',
validation_file_e='NLP2/NLP2-Projects/Project1/data/validation/dev.e',
validation_truth='NLP2/NLP2-Projects/Project1/data/validation/dev.wa.nonullalign',
test_file_f = 'NLP2/NLP2-Projects/Project1/data/testing/test/test.f',
test_file_e = 'NLP2/NLP2-Projects/Project1/data/testing/test/test.e',
test_truth = 'NLP2/NLP2-Projects/Project1/data/testing/answers/test.wa.nonullalign',
pickles_path='NLP2/NLP2-Projects/Project1/data/pickles/',
align_path='NLP2/NLP2-Projects/Project1/data/alignments/')

creating pairs


Count Words: 100%|████████████████████| 231164/231164 [04:33<00:00, 846.04it/s]
Pairs: 100%|██████████████████████████| 231164/231164 [04:43<00:00, 814.17it/s]
AlignII: 100%|█████████████████████████████████| 37/37 [00:00<00:00, 45.48it/s]
Calc LL: 100%|████████████████████████████████| 37/37 [00:00<00:00, 171.19it/s]


Results Epoch: Init
AER:
	 validation:	0.896551724137931
Log Likelihood:
	 train:		-1
	 validation:	-3866.965145717926
Perplexity:
	 train:		-1
	 validation:	213.43574923865765
start epoch: 1


E-step:  93%|██████████████████████▎ | 215201/231164 [1:51:38<08:16, 32.13it/s]

# Trained  models evaluation

In [50]:
file_path = 'data/pickles/modelI_report_t_epoch'
file_path_jump = 'data/pickles/modelII_report_ibm1_jump_dist_epoch'
save_prefix='modelI_just_evaluation_'
max_epochs = 10
max_jump = 100
model = IBM_model.I
initial_method=Initialization_type.uniform
file_enc='utf-8'
validation_file_f='data/validation/dev.f' 
validation_file_e='data/validation/dev.e'
train_file_f='data/training/hansards.36.2.f'
train_file_e='data/training/hansards.36.2.e'
test_file_f = 'data/testing/test/test.f'
test_file_e = 'data/testing/test/test.e'

with open(train_file_f, encoding=file_enc) as f:
    train_file_f = f.readlines()
with open(train_file_e, encoding=file_enc) as f:
    train_file_e = f.readlines()
with open(validation_file_f, encoding=file_enc) as f:
    validation_file_f = f.readlines()
with open(validation_file_e, encoding=file_enc) as f:
    validation_file_e = f.readlines()
with open(test_file_f, encoding=file_enc) as f:
    test_file_f = f.readlines()
with open(test_file_e, encoding=file_enc) as f:
    test_file_e = f.readlines()

In [51]:
pairs, vocab_f, vocab_e = create_pairs_and_update_files(train_file_f, train_file_e)
update_files(vocab_f, vocab_e, validation_file_f, validation_file_e)
update_files(vocab_f, vocab_e, test_file_f, test_file_e) 


Count Words:   0%|          | 0/231164 [00:00<?, ?it/s][A
Count Words:   0%|          | 390/231164 [00:00<01:00, 3845.57it/s][A


creating pairs


Count Words:   0%|          | 703/231164 [00:00<01:05, 3493.21it/s][A
Count Words:   1%|          | 1276/231164 [00:00<00:54, 4212.61it/s][A
Count Words:   1%|          | 1664/231164 [00:00<00:55, 4133.72it/s][A
Count Words:   1%|          | 1986/231164 [00:00<01:03, 3635.10it/s][A
Count Words:   1%|          | 2278/231164 [00:00<01:11, 3217.93it/s][A
Count Words:   1%|          | 2538/231164 [00:00<01:12, 3139.46it/s][A
Count Words:   1%|          | 2871/231164 [00:00<01:12, 3160.35it/s][A
Count Words:   1%|▏         | 3208/231164 [00:01<01:11, 3168.56it/s][A
Count Words:   2%|▏         | 3506/231164 [00:01<01:12, 3134.72it/s][A
Count Words:   2%|▏         | 3888/231164 [00:01<01:11, 3183.28it/s][A
Count Words:   2%|▏         | 4219/231164 [00:01<01:11, 3192.83it/s][A
Count Words:   2%|▏         | 4539/231164 [00:01<01:11, 3169.29it/s][A
Count Words:   2%|▏         | 5175/231164 [00:01<01:06, 3377.35it/s][A
Count Words:   2%|▏         | 5583/231164 [00:01<01:07, 3349.03it

In [53]:
# ttemp = pickle.load(open( "data/pickles/modelI_report_t_epoch10.p", "rb" ) )
# ttemp, jump_dist = init_params_modelI(initial_method, pairs, max_jump, ttemp)
ttemp = init_params_modelI(initial_method, pairs)

tracker = Metrics_tracker(save_prefix=save_prefix,
                          align_path='data/alignments/', 
                          validation_truth='data/validation/dev.wa.nonullalign', 
                          validation_file_f=validation_file_f, 
                          validation_file_e=validation_file_e,
                          train_file_f=train_file_f,
                          train_file_e=train_file_e,
                          test_truth = 'data/testing/answers/test.wa.nonullalign',
                          test_file_f=test_file_f,
                          test_file_e=test_file_e,
                          vocab_f=vocab_f,
                          vocab_e=vocab_e,
                          calc_LL_train=True,
                          file_enc='utf-8')
tracker.track_metrics(0, model, ttemp, jump_dist if model == IBM_model.II else None, 
                                       max_jump if model == IBM_model.II else None)
tracker.print_last_metrics('init')
for epoch in range(1,max_epochs+1):
    path = file_path+str(epoch)+str('.p')
    if model == IBM_model.II:
        path_jump = file_path_jump+str(epoch)+str('.p')
        jump_dist = pickle.load(open( path_jump, "rb" ) )
    ttemp = pickle.load(open( path, "rb" ) )
    tracker.track_metrics(0, model, ttemp, jump_dist if model == IBM_model.II else None, 
                                           max_jump if model == IBM_model.II else None)
    tracker.print_last_metrics(epoch)

Init Norm: 100%|██████████| 11614796/11614796 [00:08<00:00, 1358851.04it/s]
AlignI: 100%|██████████| 37/37 [00:00<00:00, 2189.99it/s]
Calc LL: 100%|██████████| 231164/231164 [01:21<00:00, 2834.70it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 3091.36it/s]


Results Epoch: init
AER:
	 validation:	0.9065155807365439
Log Likelihood:
	 train:		-95356501.9183324
	 validation:	-14695.750330280907
Perplexity:
	 train:		976363281.1010088
	 validation:	711193698.5401267


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2647.24it/s]
Calc LL: 100%|██████████| 231164/231164 [01:37<00:00, 2380.25it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2196.19it/s]


Results Epoch: 1
AER:
	 validation:	0.8016997167138811
Log Likelihood:
	 train:		-33126983.437462915
	 validation:	-5019.118429964895
Perplexity:
	 train:		1327.4057958289181
	 validation:	1055.0350904742213


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2689.54it/s]
Calc LL: 100%|██████████| 231164/231164 [01:40<00:00, 2306.80it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2696.36it/s]


Results Epoch: 2
AER:
	 validation:	0.49008498583569404
Log Likelihood:
	 train:		-29832286.615824815
	 validation:	-4560.100115089553
Perplexity:
	 train:		649.2327344497609
	 validation:	558.1836851400658


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2544.46it/s]
Calc LL: 100%|██████████| 231164/231164 [01:35<00:00, 2418.12it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2818.19it/s]


Results Epoch: 3
AER:
	 validation:	0.4230406043437205
Log Likelihood:
	 train:		-28397788.705085233
	 validation:	-4371.703901865533
Perplexity:
	 train:		475.51581487640476
	 validation:	429.82992206703176


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2577.08it/s]
Calc LL: 100%|██████████| 231164/231164 [01:41<00:00, 2275.10it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2481.00it/s]


Results Epoch: 4
AER:
	 validation:	0.39565627950897075
Log Likelihood:
	 train:		-27844254.87926536
	 validation:	-4309.379798223086
Perplexity:
	 train:		421.678323675536
	 validation:	394.23548267551024


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2755.05it/s]
Calc LL: 100%|██████████| 231164/231164 [01:34<00:00, 2442.79it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2641.02it/s]


Results Epoch: 5
AER:
	 validation:	0.38243626062322944
Log Likelihood:
	 train:		-27583278.15624213
	 validation:	-4287.903870930768
Perplexity:
	 train:		398.45385432505185
	 validation:	382.6658276299712


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2546.55it/s]
Calc LL: 100%|██████████| 231164/231164 [01:46<00:00, 2179.66it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2424.30it/s]


Results Epoch: 6
AER:
	 validation:	0.3729933899905571
Log Likelihood:
	 train:		-27438381.168903857
	 validation:	-4281.565249147855
Perplexity:
	 train:		386.1162283604535
	 validation:	379.3163921318975


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2752.02it/s]
Calc LL: 100%|██████████| 231164/231164 [01:37<00:00, 2364.31it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2673.37it/s]


Results Epoch: 7
AER:
	 validation:	0.3682719546742209
Log Likelihood:
	 train:		-27364999.099623423
	 validation:	-4284.7645585559085
Perplexity:
	 train:		380.01440123519365
	 validation:	381.003281100288


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2453.90it/s]
Calc LL: 100%|██████████| 231164/231164 [01:44<00:00, 2213.31it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2420.78it/s]


Results Epoch: 8
AER:
	 validation:	0.36449480642115206
Log Likelihood:
	 train:		-27328621.777428117
	 validation:	-4293.577126977958
Perplexity:
	 train:		377.02541788527014
	 validation:	385.6887470101314


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2501.00it/s]
Calc LL: 100%|██████████| 231164/231164 [01:39<00:00, 2321.45it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2253.27it/s]


Results Epoch: 9
AER:
	 validation:	0.3616619452313503
Log Likelihood:
	 train:		-27317140.41389409
	 validation:	-4303.960298904077
Perplexity:
	 train:		376.0869286112625
	 validation:	391.2832647331653


AlignI: 100%|██████████| 37/37 [00:00<00:00, 2581.15it/s]
Calc LL: 100%|██████████| 231164/231164 [01:46<00:00, 2162.27it/s]
Calc LL: 100%|██████████| 37/37 [00:00<00:00, 2248.89it/s]

Results Epoch: 10
AER:
	 validation:	0.3616619452313503
Log Likelihood:
	 train:		-27321382.344021343
	 validation:	-4313.795797299753
Perplexity:
	 train:		376.43339253109247
	 validation:	396.65751584680845





In [54]:
tracker.save_metrics('data/pickles/modelII_evaluation_metrics_ibm1.p')