In [None]:
import copy
import pickle
import pdb
import sys
sys.path.append("..")
import os

import numpy as np
import tensorflow as tf
from tqdm import tqdm

from utils import utils
from utils.metrics import BleuScore
from models.transformer import Transformer

In [2]:
# paths
# NOTE(review): both locations are absolute paths on the training cluster;
# the notebook will not run elsewhere without editing them.
data_dir = '/project/cq-training-1/project2/teams/team12/data/'
best_model_path = '/project/cq-training-1/project2/submissions/team12/low-resource-translation/saved_model/Transformer-num_layers_2-d_model_128-num_heads_8-dff_512_fr_to_en_False_embedding_None_embedding_dim_128_back_translation_True_ratio_4.0'
# 'train.lang1' is bound to the English names and 'train.lang2' to the French ones.
path_en, path_fr = (
    os.path.join(data_dir, corpus_file)
    for corpus_file in ('train.lang1', 'train.lang2')
)


# Create vocabs
# Each call yields a (word -> index, index -> word) pair for one language.
# vocab_size=None presumably means "no vocabulary cap" -- confirm in utils.create_vocab.
word2idx_en, idx2word_en = utils.create_vocab(path_en, vocab_size=None)
word2idx_fr, idx2word_fr = utils.create_vocab(path_fr, vocab_size=None)

In [3]:
# Only element [1] of the loader's result is kept.
# NOTE(review): presumably the loader returns (train, valid, ...) -- confirm
# against utils.load_training_data before relying on the index.
valid_dataset = utils.load_training_data(
    path_en,
    path_fr,
    word2idx_en,
    word2idx_fr,
    seq_len=150,
    batch_size=64,
)[1]

In [4]:
# Additional methods 
def get_next(self, x, y):
    """Run one inference-mode forward pass and expose the last position.

    Args:
        x: source batch, forwarded unchanged to ``self.forward``.
        y: target prefix decoded so far, forwarded unchanged to ``self.forward``.

    Returns:
        A 2-tuple ``(last_step, all_steps)`` where ``all_steps`` is whatever
        ``self.forward(x, y, training=False)`` returned and ``last_step`` is
        its slice ``[:, -1, :]`` (the final position along axis 1).
    """
    all_steps = self.forward(x, y, training=False)
    last_step = all_steps[:, -1, :]
    return last_step, all_steps

def update_state(self, y_true, y_pred, vocab, idx=False):
    """Accumulate sentence-level BLEU for one batch into ``self``.

    For every example the label and the prediction are turned into sentences
    with the ``utils`` helpers, scored with ``sacrebleu.sentence_bleu``
    (``smooth_method='exp'``), and the score is added to the running totals.

    Args:
        y_true: indexable batch of label token ids; each ``y_true[i]`` must
            provide ``.numpy()``.
        y_pred: indexable batch of predictions. With ``idx=True`` each
            ``y_pred[i]`` is passed as-is to ``utils.generate_sentence``
            (i.e. token ids); otherwise ``y_pred[i].numpy()`` is passed to
            ``utils.generate_sentence_from_probabilities``.
        vocab: index -> word mapping handed to the ``utils`` helpers.
        idx: whether ``y_pred`` already contains token ids.

    Side effects:
        Adds each sentence score to ``self.total_score`` and increments
        ``self.total_num_examples`` once per example.
    """
    # This function is defined in the notebook and monkey-patched onto
    # BleuScore, so global names resolve in the notebook namespace, where
    # `sacrebleu` is never imported. Importing here avoids the NameError.
    import sacrebleu

    for i in range(len(y_true)):
        label_ids = y_true[i].numpy().astype('int')
        label_sentence = utils.generate_sentence(label_ids, vocab)
        if idx:
            pred_sentence = utils.generate_sentence(y_pred[i], vocab)
        else:
            pred_sentence = utils.generate_sentence_from_probabilities(y_pred[i].numpy(), vocab)
        # NOTE(review): the reference is passed as a plain string; some
        # sacrebleu releases expect a list of references -- confirm against
        # the installed version.
        sentence_score = sacrebleu.sentence_bleu(pred_sentence, label_sentence, smooth_method='exp').score
        self.total_score += sentence_score
        self.total_num_examples += 1

In [5]:
# Attach the notebook-level helpers to the imported classes so that existing
# and future instances expose them as regular methods.
setattr(Transformer, "get_next", get_next)
# This replaces whatever update_state BleuScore shipped with.
setattr(BleuScore, "update_state", update_state)

In [6]:
# Load model
# Architecture hyper-parameters; they mirror the values spelled out in the
# checkpoint directory name (num_layers_2-d_model_128-num_heads_8-dff_512).
model_config = dict(num_layers=2, d_model=128, dff=512, num_heads=8)
# NOTE(review): the English side is passed as a vocabulary *size* while the
# French side is passed as the full word->index dict -- confirm this matches
# the Transformer constructor.
model = Transformer(model_config, len(word2idx_en), word2idx_fr)
# Last expression of the cell, so the returned load-status object is displayed.
model.load_weights(os.path.join(best_model_path, "model"))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2ad323081390>

In [7]:
def tf_beam_search(model,batch,steps,width,word2idx_fr,bonus=0):
    """Batched beam search that keeps the surviving beams of every step.

    Args:
        model: object exposing ``start_token`` and ``get_next(x, y)``. The
            first value returned by ``get_next`` is soft-maxed over its last
            axis here, and the indices along that axis are used as token ids.
        batch: mapping whose ``'inputs'`` entry is the source batch; its first
            dimension is taken as the batch size ``bs``.
        steps: number of decoding steps, counting the initial start-token step.
        width: beam width.
        word2idx_fr: target vocabulary; only ``word2idx_fr['<end>']`` is read.
        bonus: length reward. Before the beams of step ``i-1`` are expanded,
            every beam that contains ``'<end>'`` anywhere gets
            ``bonus * (i-1)`` added to its stored score (in place, so the
            reward is also inherited by its continuations).

    Returns:
        ``(hist, hist_probs)``:
        * ``hist`` -- list of ``steps`` arrays. ``hist[0]`` has shape
          ``(1, bs, 1)`` and holds the start token; ``hist[i]`` (``i >= 1``)
          has shape ``(width, bs, i+1)`` and holds the token ids of the beams
          kept after step ``i``.
        * ``hist_probs`` -- float32 array of shape ``(steps, width, bs)`` with
          the cumulative score (summed log-probabilities plus the bonus and
          penalty terms) of each kept beam; row 0 is all zeros.

    NOTE(review): the bonus is applied to rows ``0 .. steps-2`` only; the last
    row of ``hist_probs`` never receives it. Confirm this is intended.
    """
    x,bs = batch['inputs'], batch['inputs'].shape[0]
    end_token = word2idx_fr['<end>'] # loop-invariant, looked up once

    # adjust for start
    hist = [np.ones((1,bs,1),dtype=np.int32)* model.start_token]
    for i in range(1,steps):
        # length of input = i+1 at each timestep (placeholder, overwritten below)
        hist.append(np.ones((width,bs,i+1),dtype=np.int32))
    hist_probs = np.zeros((steps,width,bs),dtype=np.float32)

    for i in range(1,steps): # loop over steps
        wid = hist[i-1].shape[0] # adjust for start (1 beam at step 0, `width` afterwards)
        # total candidates = wid*width at each timestep
        cand = np.zeros((bs,wid*width,i+1),dtype=np.int32)
        cand_probs = np.zeros((bs,wid*width),dtype=np.float32)
        for j in range(wid): # loop over width elements

            # reward beams that already contain <end>; row indices may repeat,
            # but fancy-index += applies the increment once per row
            finished = np.nonzero(hist[i-1][j] == end_token)[0]
            if finished.size > 0:
                hist_probs[i-1,j][finished] += bonus*(i-1)

            # output of prev step is current input step
            curr = tf.convert_to_tensor(hist[i-1][j])
            curr_probs = tf.convert_to_tensor(hist_probs[i-1,j])

            # beams whose LAST token is <end> get a large penalty so that
            # their continuations rank below every live candidate
            just_ended = np.flatnonzero(curr[:,-1] == end_token) # check for end
            if just_ended.size > 0: # eager tensor does not support item assignment
                penalised = curr_probs.numpy()
                penalised[just_ended] += - 100
                curr_probs = tf.convert_to_tensor(penalised)

            preds,_ = model.get_next(x,curr)
            preds = tf.nn.softmax(preds,-1)
            topk= tf.argsort(preds,axis=-1,direction='DESCENDING')[:,:width] # take top 'width' predictions
            topk_probs = tf.sort(preds,axis=-1,direction='DESCENDING')[:,:width] # take top 'width' probs
            curr = tf.broadcast_to(tf.expand_dims(curr,1),(bs,width,curr.shape[-1])) # bs, width, i
            topk = tf.expand_dims(topk,-1) # shape = bs, width,1
            cand[:,j*width:(j+1)*width] = tf.concat([curr,topk],-1) # next step shape = current_shape + 1
            cand_probs[:,j*width:(j+1)*width] = curr_probs[:,None] + np.log(topk_probs) # add log probs

        cand, cand_probs = tf.convert_to_tensor(cand), tf.convert_to_tensor(cand_probs)
        indices = tf.argsort(cand_probs,axis=-1,direction='DESCENDING')[:,:width] # from candidates = wid*width pick width
        value = tf.gather(cand,indices,axis=1,batch_dims=1)
        hist[i] = tf.transpose(value,perm=(1,0,2)).numpy() # store next step inputs
        hist_probs[i] = tf.transpose(tf.gather(cand_probs,indices,axis=-1,batch_dims=1)).numpy() # store probs

    return hist, hist_probs

In [8]:
def get_various_runs(model,batch,word2idx_fr,steps=134,width=5,bonus=0):
    """Run ``tf_beam_search`` and drop the initial start-token step.

    Args:
        model, batch, word2idx_fr, bonus: forwarded to ``tf_beam_search``.
        steps: number of decoding steps requested from the search.
        width: beam width requested from the search.

    Returns:
        ``(options, probs)``: the two results of ``tf_beam_search`` with their
        first entry (index 0) removed, so both start at the first decoded step.
    """
    hist, hist_probs = tf_beam_search(model, batch, steps, width, word2idx_fr, bonus)
    return hist[1:], hist_probs[1:]

In [9]:
def process_batch(options,probs,alpha,skip):
    """Pick the best length-normalised beam for every example of a batch.

    Args:
        options: sequence of per-step arrays; ``options[t][k, b]`` is the
            token sequence of beam ``k`` for example ``b`` at step ``t``.
        probs: array of shape ``(num_steps, width, batch)`` holding the score
            of each beam, aligned with ``options``.
        alpha: exponent of the length normalisation; the scores of step ``t``
            (0-based) are divided by ``(t + 1) ** alpha``.
        skip: number of leading steps excluded from the selection.

    Returns:
        A list with one entry per example: the token sequence of the beam
        with the highest normalised score among the remaining steps.

    Raises:
        ValueError: if ``skip`` leaves no step to choose from.
    """
    # The normaliser is sized from the data. It used to be hard-coded to 149
    # rows, which only broadcast for steps=150 and failed for the default
    # steps=134 of get_various_runs.
    lengths = np.arange(1, len(probs) + 1)
    probs = probs / (lengths ** alpha).reshape(-1,1,1)
    options,probs = options[skip:], probs[skip:]
    if probs.shape[0] == 0:
        raise ValueError(f"skip={skip} leaves no decoding step to select from")

    preds = []
    for b in range(probs.shape[2]):
        example_scores = probs[:,:,b]
        step, beam = np.unravel_index(np.argmax(example_scores), example_scores.shape)
        preds.append(options[step][beam,b])
    return preds

In [10]:
# get all valid batches
# Materialised once so that individual batches can be addressed by position
# (valid_batches[j]) further down.
valid_batches = list(tqdm(valid_dataset))

26it [00:00, 99.43it/s]


In [11]:
# Try different hyper-parameters
# bonus: candidate values for the `bonus` argument of tf_beam_search, the
# per-step reward added to beams that already contain '<end>'.
bonus = [0, 0.05, 0.1, 0.2, 0.3]
# alpha: candidate exponents for the length normalisation in process_batch
# (scores are divided by length ** alpha).
alpha = [0.6, 0.7, 0.8, 0.9, 1]

In [None]:
# Run the beam search once per bonus value over the whole validation set.
# list_options[b][j] / list_probs[b][j] hold the per-step beams and scores of
# batch j for the b-th bonus value.
list_options, list_probs =[], []
for b in bonus:
    batch_ops, batch_probs =[], []
    # Iterate over the cached `valid_batches`, not `valid_dataset`: the scores
    # are later compared with valid_batches[j]['labels'], so predictions must
    # come from exactly those batches in exactly that order. Re-iterating the
    # dataset only guarantees that if its order never changes between passes.
    # The list also gives tqdm its length, so no hard-coded total is needed.
    for batch in tqdm(valid_batches):
        run_options, run_probs = get_various_runs(model,batch,word2idx_fr,bonus=b)
        batch_ops.append(run_options)
        batch_probs.append(run_probs)

    list_options.append(batch_ops)
    list_probs.append(batch_probs)

# Aliases used by the evaluation cell.
options,probs  = list_options, list_probs

In [None]:
# Evaluate Bleu Score
# `bleu_beam` is not created anywhere else in the notebook, so it is
# instantiated here to keep the cell runnable on a fresh kernel.
# NOTE(review): assumes BleuScore can be constructed without arguments --
# confirm against utils.metrics.
bleu_beam = BleuScore()
# Grid over (bonus, alpha): the beams for each bonus were computed once, and
# alpha only changes how process_batch ranks them.
for i,b in enumerate(bonus):
    for k,alp in enumerate(alpha):
        bleu_beam.reset_states()
        for j,batch_options in enumerate(options[i]):
            batch_probs = probs[i][j]
            # skip=10 excludes the first 10 decoding steps from the selection
            preds = process_batch(batch_options,batch_probs,alp,skip=10)
            # idx=True: preds are token ids, not probability distributions
            bleu_beam.update_state(valid_batches[j]['labels'], preds, idx2word_fr, idx = True)
        print(f"Bleu Score for bonus: {b} , Alpha: {alp} = {bleu_beam.result()}")

In [69]:
# See some examples
steps,width = 150, 5
hist, hist_probs = tf_beam_search(model,valid_batches[0],steps,width,word2idx_fr, bonus = 0.4)
# Drop the start-token step from BOTH results so that hist[t] and
# hist_probs[t] describe the same decoding step. Slicing only the scores
# would make every later lookup hist[t] one step behind hist_probs[t].
hist = hist[1:]
# Length normalisation with exponent 0.7 (row t is divided by (t+1) ** 0.7).
hist_probs = hist_probs[1:] / (np.arange(1,steps) ** 0.7).reshape(-1,1,1)
# Plain model call on the same batch, kept for comparison with the beam search.
output = model(valid_batches[0])

In [97]:
# Position, inside the first validation batch, of the example to display.
idx = 11
first_batch = valid_batches[0]

source_sentence = utils.generate_sentence(first_batch['inputs'][idx].numpy(), idx2word_en)
print("Source:", source_sentence, sep="\n")

target_sentence = utils.generate_sentence(first_batch['labels'][idx].numpy(), idx2word_fr)
print("\nTarget:", target_sentence, sep="\n")

Source:
as we know the reduction of regional disparities is one of the fundamental aims of the eu

Target:
Comme nous le savons , la disparition des disparités régionales constitue un des objectifs fondamentaux de l' ue .


In [98]:
# Sentence decoded from the plain model output for the selected example.
greedy_sentence = utils.generate_sentence_from_probabilities(output[idx], idx2word_fr)
print("Greedy Prediction:", greedy_sentence, sep="\n")

# Ignore the first `skip` decoding steps, then take the (step, beam) pair with
# the highest score for this example.
skip = 10
example_scores = hist_probs[skip:, :, idx]
i = np.unravel_index(np.argmax(example_scores), example_scores.shape)
best_step, best_beam = i
beam_sentence = utils.generate_sentence(hist[skip:][best_step][best_beam, idx], idx2word_fr)
print("\nBeam Search Prediction:", beam_sentence, sep="\n")

Greedy Prediction:
Comme nous le savons la réduction des émissions de conum , il est essentiel de réduire les émissions de conum .

Beam Search Prediction:
Comme nous le savons tous , la réduction des disparités régionales est un des objectifs fondamentaux de l' ue .
