## Importing the Dependencies

In [None]:
import os
import random

import numpy as np
import trax
from trax import layers as tl
from trax.fastmath import numpy as fastnp
from trax.supervised import training

## Preprocessing the Data for Training

### Downloading the Data

In [None]:
# Both stream functions read the same TFDS dataset with the same settings;
# they differ only in which split (train vs. held-out eval) they serve.
train_stream_fn, eval_stream_fn = (
    trax.data.TFDS('opus/medical',
                   data_dir='./data/',
                   keys=('en', 'de'),
                   eval_holdout_size=0.01,  # 1% for eval
                   train=is_train_split)
    for is_train_split in (True, False)
)



In [None]:
# Calling a stream function returns the actual generator of (en, de) examples.
train_stream = train_stream_fn()

eval_stream = eval_stream_fn()

In [None]:
# Peek at one raw example per stream. Note that next() consumes the example
# from the generator it is called on.
print(next(train_stream))
print(next(eval_stream))

(b'In the pregnant rat the AUC for calculated free drug at this dose was approximately 18 times the human AUC at a 20 mg dose.\n', b'Bei tr\xc3\xa4chtigen Ratten war die AUC f\xc3\xbcr die berechnete ungebundene Substanz bei dieser Dosis etwa 18-mal h\xc3\xb6her als die AUC beim Menschen bei einer 20 mg Dosis.\n')
(b'Subcutaneous use and intravenous use.\n', b'Subkutane Anwendung und intraven\xc3\xb6se Anwendung.\n')


### Tokenizing the Data

In [None]:
# Subword vocabulary used for tokenizing both languages
VOCAB_FILE = 'ende_32k.subword'
VOCAB_DIR = 'data/'

# Wrap each raw stream with its own Tokenize step (train first, then eval).
tokenized_train_stream, tokenized_eval_stream = (
    trax.data.Tokenize(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)(raw_stream)
    for raw_stream in (train_stream, eval_stream)
)

In [None]:
##Token id appended to every sentence to mark its end
EOS = 1

def append_eos(stream):
    """Yield every (inputs, targets) pair of `stream` with EOS appended to both.

    Args:
        stream: iterable of (inputs, targets) pairs of token-id sequences.

    Yields:
        Tuple of two numpy arrays: the input ids followed by EOS and the
        target ids followed by EOS.
    """
    for source_ids, target_ids in stream:
        yield np.array([*source_ids, EOS]), np.array([*target_ids, EOS])

In [None]:
# Wrap both tokenized streams so every sentence ends with EOS.
# NOTE: the names are rebound in place, so re-running this cell wraps the
# streams again and appends a second EOS.
tokenized_train_stream = append_eos(tokenized_train_stream)
tokenized_eval_stream = append_eos(tokenized_eval_stream)

### Filtering out too long Sentences

In [None]:
# Filter out examples longer than 512 tokens.
# length_keys=[0, 1] presumably applies the limit to both the input (index 0)
# and the target (index 1) -- confirm against the FilterByLength docs.
filtered_train_stream = trax.data.FilterByLength(max_length=512, length_keys=[0, 1])(tokenized_train_stream)
filtered_eval_stream = trax.data.FilterByLength(max_length=512, length_keys=[0, 1])(tokenized_eval_stream)

In [None]:
# Pull one tokenized example to inspect its ids; train_input / train_target
# are reused further down to test detokenize().
train_input, train_target = next(filtered_train_stream)
print(train_input)
print(train_target)

[ 8569  4094  2679 32826 22527     5 30650  4729   992     1]
[12647 19749    70 32826 10008     5 30650  4729   992     1]


### Helper Functions for Tokenizing and Detokenizing the Input

In [None]:
##Let's create functions to tokenize and detokenize the Sentences or Sequences

In [None]:
def tokenize(sentence, vocab_file=None, vocab_dir=None):
    """Encode one sentence as a batch of token ids terminated by EOS.

    Args:
        sentence: the text to encode.
        vocab_file: vocabulary file name, forwarded to `trax.data.tokenize`.
        vocab_dir: vocabulary directory, forwarded to `trax.data.tokenize`.

    Returns:
        numpy array of shape (1, n_tokens + 1): the token ids followed by
        the EOS id (1), with a leading batch axis.
    """
    ##Encoding of the end-of-sentence token
    end_of_sentence = 1

    # trax.data.tokenize works on streams, so wrap the sentence in a
    # one-element iterator and take the single result.
    single_sentence_stream = iter([sentence])
    token_ids = next(trax.data.tokenize(single_sentence_stream,
                                        vocab_file=vocab_file, vocab_dir=vocab_dir))

    # Append EOS and put the batch dimension in front
    return np.array([*token_ids, end_of_sentence]).reshape(1, -1)

In [None]:
def detokenize(tokenized_sentence, vocab_file=None, vocab_dir=None):
    """Decode token ids back into text, dropping EOS and everything after it.

    Args:
        tokenized_sentence: token ids as a list or array; any batch axes are
            flattened away before decoding.
        vocab_file: vocabulary file name, forwarded to `trax.data.detokenize`.
        vocab_dir: vocabulary directory, forwarded to `trax.data.detokenize`.

    Returns:
        Whatever `trax.data.detokenize` returns for the ids before the first EOS.
    """
    EOS = 1

    ##Flatten instead of squeezing: np.squeeze turns a single-token input
    ##(e.g. [EOS]) into a 0-d array, which cannot be converted to a list.
    token_ids = np.asarray(tokenized_sentence).reshape(-1).tolist()

    ##Keep only the tokens in front of the first EOS
    if EOS in token_ids:
        token_ids = token_ids[:token_ids.index(EOS)]

    return trax.data.detokenize(token_ids, vocab_file=vocab_file, vocab_dir=vocab_dir)

In [None]:
##Sanity check: decode the example pulled from the training stream back to text
print(detokenize(train_input, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print(detokenize(train_target, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))

Decreased Appetite

Verminderter Appetit



In [None]:
# Sanity check: the result has a leading batch axis and ends with EOS (1)
print(tokenize('hello', vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))

[[17332   140     1]]


### Getting the Data into Batches

In [None]:
##Now let's create stream of batches

In [None]:
# Length limits that separate the buckets
boundaries =  [8,   16,  32, 64, 128, 256, 512]
# Batch size per bucket: shorter sequences get larger batches. There is one
# more entry than in `boundaries`; presumably the last one applies to sequences
# beyond the final boundary -- confirm against the BucketByLength docs.
batch_sizes = [256, 128, 64, 32, 16, 8, 4, 2]

In [None]:
# Creating the generators: group examples of similar length into batches,
# using the bucket boundaries and batch sizes defined above.
train_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes, length_keys=[0, 1])(filtered_train_stream)

eval_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes, length_keys=[0, 1])(filtered_eval_stream)

### Add Loss Weights so that Padding Tokens are Ignored in the Loss

In [None]:
##Add a loss-weight (mask) entry to every batch; id_to_mask=0 refers to the padding id.
##NOTE: the streams are rebound in place, so run this cell only once per kernel session.
train_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(train_batch_stream)
eval_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(eval_batch_stream)

In [None]:
# Each batch is now a triple: inputs, targets and the loss weights added above.
input_batch, target_batch, mask_batch = next(train_batch_stream)

print("input_batch data type: ", type(input_batch))
print("target_batch data type: ", type(target_batch))

# Both arrays are printed as (batch size, sequence length)
print("input_batch shape: ", input_batch.shape)
print("target_batch shape: ", target_batch.shape)

input_batch data type:  <class 'numpy.ndarray'>
target_batch data type:  <class 'numpy.ndarray'>
input_batch shape:  (32, 64)
target_batch shape:  (32, 64)


## Creating the Model

### Now we will create the model for training. The model consists of an encoder and two decoders.
#### The encoder takes the inputs and outputs the keys and values.
#### The first decoder is the pre-attention decoder: it takes the targets and outputs the queries. It uses teacher forcing, so the targets are shifted to the right.
#### The keys, values and queries are then passed to the attention layer to get the output/context vectors.
#### These are then passed to the second decoder, which produces the final output translation.

In [None]:
##let's implement the Attention Mechanism and create our model

### Defining the Encoder

In [None]:
def input_encoder_fn(input_vocab_size, d_model, n_encoder_layers):
    """Build the input encoder: an embedding followed by a stack of LSTMs.

    Args:
        input_vocab_size: vocabulary size passed to the embedding layer.
        d_model: embedding size, also used as the number of LSTM units.
        n_encoder_layers: how many LSTM layers to stack.

    Returns:
        A `tl.Serial` layer combining the embedding and the LSTM stack.
    """
    # embedding layer that converts tokens to vectors
    embedding = tl.Embedding(input_vocab_size, d_model)

    # LSTM layers that consume the embeddings
    lstm_stack = [tl.LSTM(n_units=d_model) for _ in range(n_encoder_layers)]

    return tl.Serial(embedding, lstm_stack)

### Defining the Pre-Attention Decoder

In [None]:
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    """Build the pre-attention decoder that turns target tokens into queries.

    Args:
        mode: run mode of the model (e.g. 'train' or 'eval'); forwarded to
            `tl.ShiftRight` so the layer is built for the same mode as the
            rest of the model.
        target_vocab_size: vocabulary size passed to the embedding layer.
        d_model: embedding size, also used as the number of LSTM units.

    Returns:
        A `tl.Serial` layer: ShiftRight -> Embedding -> LSTM.
    """
    ##create a serial network
    pre_attention_decoder = tl.Serial(

        ##shift right to insert start-of-sentence token and implement teacher forcing during training
        tl.ShiftRight(mode=mode),

        ##running an embedding layer to convert tokens to vectors
        tl.Embedding(target_vocab_size, d_model),

        ##feed it to an LSTM layer
        tl.LSTM(n_units = d_model)
    )

    return pre_attention_decoder

### Preparing the Inputs (Queries, Keys and Values) for the Attention Layer

In [None]:
##Now let's prepare inputs for the Attention

In [None]:
def prepare_attention_input(encoder_activations, decoder_activations, inputs):
    """Arrange queries, keys, values and a padding mask for the attention layer.

    Args:
        encoder_activations: encoder output; used as both keys and values.
        decoder_activations: pre-attention decoder output; used as queries.
            Its axis 1 supplies the decoder length of the mask.
        inputs: 2-D array of input token ids, where 0 marks padding.

    Returns:
        Tuple (queries, keys, values, mask); mask has shape
        [batch size, 1, decoder length, encoder length] and is non-zero
        where the input token is not padding.
    """
    # real tokens are everything except the padding id 0
    is_real_token = (inputs != 0)
    n_batch, n_encoder_positions = is_real_token.shape[0], is_real_token.shape[1]
    n_decoder_positions = decoder_activations.shape[1]

    # insert singleton axes for attention heads and decoder length ...
    padding_mask = fastnp.reshape(is_real_token, (n_batch, 1, 1, n_encoder_positions))

    # ... then broadcast over the decoder length by adding zeros
    # (the heads axis stays at size 1)
    padding_mask = padding_mask + fastnp.zeros((1, 1, n_decoder_positions, 1))

    # queries come from the decoder; keys and values both come from the encoder
    return decoder_activations, encoder_activations, encoder_activations, padding_mask

### Creating our Model
- Encoder and Pre-Attention Decoder will run in Parallel.
- There will be a Residual Connection in the Attention Layer.
- 'Select' is used for removing masks before passing the output vectors to the final Decoder for translation and also for copying inputs and targets in the beginning.
- 'LogSoftmax' is for log Probabilities.

In [None]:
def NMTAttn(input_vocab_size=33300,
            target_vocab_size=33300,
            d_model=1024,
            n_encoder_layers=2,
            n_decoder_layers=2,
            n_attention_heads=4,
            attention_dropout=0.0,
            mode='train'):
    """Build the LSTM encoder-decoder translation model with attention.

    The model takes (input tokens, target tokens) and produces log
    probabilities over the target vocabulary together with the target tokens.

    Args:
        input_vocab_size: vocabulary size of the input language.
        target_vocab_size: vocabulary size of the target language; also the
            width of the final Dense layer.
        d_model: embedding size and number of LSTM units used throughout.
        n_encoder_layers: number of LSTM layers in the input encoder.
        n_decoder_layers: number of LSTM layers after the attention layer.
        n_attention_heads: number of heads of the attention layer.
        attention_dropout: dropout rate of the attention layer.
        mode: run mode (e.g. 'train' or 'eval'); passed to the pre-attention
            decoder and to the attention layer.

    Returns:
        A `tl.Serial` model.
    """
    #creating layers for the input encoder
    input_encoder = input_encoder_fn(input_vocab_size, d_model, n_encoder_layers)

    #creating layers for the pre-attention decoder
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    model = tl.Serial(

      #copy input tokens and target tokens as they will be needed later.
      tl.Select([0, 1, 0, 1]),

      #run input encoder on the input and pre-attention decoder on the target in parallel.
      tl.Parallel(input_encoder, pre_attention_decoder),

      #prepare queries, keys, values and mask for attention.
      tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4),

      #run the AttentionQKV layer
      #nest it inside a Residual layer to add to the pre-attention decoder activations (i.e. queries)
      #the layer must follow the requested mode rather than always being built for training
      tl.Residual(tl.AttentionQKV(d_model, n_heads=n_attention_heads, dropout=attention_dropout, mode=mode)),

      #drop the attention mask
      tl.Select([0, 2]),

      #run the rest of the RNN decoder
      [tl.LSTM(n_units = d_model) for _ in range(n_decoder_layers)],

      #project to one score per target-vocabulary entry
      tl.Dense(target_vocab_size),

      #turn the scores into log probabilities
      tl.LogSoftmax()
    )

    return model

In [None]:
# Build the model with its default hyperparameters and print its layer structure
model = NMTAttn()
print(model)

Serial_in2_out2[
  Select[0,1,0,1]_in2_out4
  Parallel_in2_out2[
    Serial[
      Embedding_33300_1024
      LSTM_1024
      LSTM_1024
    ]
    Serial[
      Serial[
        ShiftRight(1)
      ]
      Embedding_33300_1024
      LSTM_1024
    ]
  ]
  PrepareAttentionInput_in3_out4
  Serial_in4_out2[
    Branch_in4_out3[
      None
      Serial_in4_out2[
        _in4_out4
        Serial_in4_out2[
          Parallel_in3_out3[
            Dense_1024
            Dense_1024
            Dense_1024
          ]
          PureAttention_in4_out2
          Dense_1024
        ]
        _in2_out2
      ]
    ]
    Add_in2
  ]
  Select[0,2]_in3_out2
  LSTM_1024
  LSTM_1024
  Dense_33300
  LogSoftmax
]


In [None]:
##Ok so now our model is Created.

## Training the Model

In [None]:
##Let's train it

### Defining the Training Task and Eval Task for training

In [None]:
def train_task_function(train_batch_stream):
    """Create the training task for the translation model.

    Args:
        train_batch_stream: stream of training batches used as labeled data.

    Returns:
        A `training.TrainTask` using cross-entropy loss, an Adam optimizer
        created with 0.01, a warmup-and-rsqrt-decay schedule created with
        (1000, 0.01), and a checkpoint every 10 steps.
    """
    task_settings = dict(
        labeled_data=train_batch_stream,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=trax.lr.warmup_and_rsqrt_decay(1000, 0.01),
        ##checkpoint every 10 steps
        n_steps_per_checkpoint=10,
    )

    return training.TrainTask(**task_settings)

In [None]:
##Instantiating the training task on the batched training stream
train_task = train_task_function(train_batch_stream)

In [None]:
##The eval task reports cross-entropy loss and accuracy on the held-out eval batches
eval_task = training.EvalTask(

    labeled_data=eval_batch_stream,
    metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
)

In [None]:
##Let's start the training, for that we need to create a train loop, that is very easy using trax.

In [None]:
# Directory handed to the training loop as its output location
output_dir = 'output_dir/'


# defining the training loop: a fresh model in train mode, driven by the
# train task and evaluated with the eval task
training_loop = training.Loop(NMTAttn(mode='train'),
                              train_task,
                              eval_tasks=[eval_task],
                              output_dir=output_dir)

### Start the Training

In [None]:
##Run the training loop for 10 steps (a short demonstration run)
training_loop.run(10)

In [None]:
##It takes a long time to run, so let's just load a pre-trained Model

## Testing the Model and translating the text

In [None]:
# Location of the pre-trained checkpoint, relative to the working directory.
# Kept under its own name: binding the path string to `model` was misleading,
# because the next cell immediately rebinds `model` to the network itself.
MODEL_PATH = os.path.join('Files', 'output_dir', 'model.pkl.gz')

In [None]:
#Instantiating the model we built in eval mode
model = NMTAttn(mode='eval')

##initializing weights from a pre-trained model
##NOTE(review): the checkpoint path is a hard-coded absolute path (it looks like a
##Colab layout) -- adjust it when running elsewhere.
model.init_from_file('/content/Files/output_dir/model.pkl.gz', weights_only=True)
##wrap the model with tl.Accelerate before using it for decoding
model = tl.Accelerate(model)

### Helper Function to get the next word in Translation

In [None]:
def next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature):
    """Predict the next output token for a partially translated sentence.

    Args:
        NMTAttn: callable model; called with (input_tokens, padded targets)
            and expected to return a pair whose first item holds the log
            probabilities.
        input_tokens: tokenized input sentence, including the batch axis.
        cur_output_tokens: list of the output token ids generated so far.
        temperature: sampling temperature passed to `tl.logsoftmax_sample`.

    Returns:
        Tuple (symbol, log_prob): the sampled token id as an int and its
        log probability as a float.
    """
    n_generated = len(cur_output_tokens)

    # pad to the smallest power of two that is larger than the current length
    padded_length = 2 ** int(np.ceil(np.log2(n_generated + 1)))
    zero_padding = [0] * (padded_length - n_generated)

    # the model expects a batch axis in front of the target tokens
    decoder_input = np.array(cur_output_tokens + zero_padding).reshape(1, padded_length)

    ##model prediction
    all_log_probs, _ = NMTAttn((input_tokens, decoder_input))

    ##log probabilities at the position of the next token
    next_log_probs = all_log_probs[0, n_generated, :]

    ##sample the next symbol/word from the log-softmax distribution
    symbol = int(tl.logsoftmax_sample(next_log_probs, temperature))

    return symbol, float(next_log_probs[symbol])

### Helper Function to translate the whole Sentence.
#### It will make use of the function we defined above for getting the next translated word and translate the whole sentence or sequence until the end of sentence (EOS) token.

In [None]:
def sampling_decode(input_sentence, NMTAttn = None, temperature=0.0, vocab_file=None, vocab_dir=None, next_symbol=next_symbol, tokenize=tokenize, detokenize=detokenize, max_length=None):
    """Translate a sentence token by token until EOS is produced.

    Args:
        input_sentence: the text to translate.
        NMTAttn: the model, handed to `next_symbol`.
        temperature: sampling temperature handed to `next_symbol`.
        vocab_file: vocabulary file name for tokenizing and detokenizing.
        vocab_dir: vocabulary directory for tokenizing and detokenizing.
        next_symbol: function that returns (next token id, its log probability).
        tokenize: function that turns the sentence into input tokens.
        detokenize: function that turns output tokens back into text.
        max_length: optional upper bound on the number of generated tokens.
            None (the default) keeps decoding until EOS, however long it takes;
            set it to guard against a model that never emits EOS.

    Returns:
        Tuple (cur_output_tokens, log_prob, sentence): the generated token
        ids, the log probability of the LAST generated token (0.0 if no
        token was generated), and the detokenized translation.
    """
    ##Tokenizing the input sentence
    input_tokens = tokenize(input_sentence, vocab_file, vocab_dir)

    ##Initializing an empty list of output tokens
    cur_output_tokens = []

    # initialize an integer that represents the current output
    cur_output = 0

    # defined up front so the return value exists even if no token is generated
    log_prob = 0.0

    # Set the encoding of the "end of sentence"
    EOS = 1

    while cur_output != EOS:

        ##stop early once the optional length limit has been reached
        if max_length is not None and len(cur_output_tokens) >= max_length:
            break

        ##updating the current output token by getting the index of the next word
        cur_output, log_prob = next_symbol(NMTAttn, input_tokens, cur_output_tokens, temperature)

        ##append the current output token to the list of output tokens
        cur_output_tokens.append(cur_output)

    ##detokenize the output tokens
    sentence = detokenize(cur_output_tokens, vocab_file, vocab_dir)

    return cur_output_tokens, log_prob, sentence

In [None]:
# Decode one sentence with temperature 0.0; the cell displays the returned
# (tokens, log probability, sentence) tuple.
sampling_decode("I love languages.", NMTAttn=model, temperature=0.0, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

### Greedy Decode to get the Most Probable next word.

In [None]:
##Let's use the Greedy Decode Method

In [None]:
def greedy_decode_test(sentence, NMTAttn=None, vocab_file=None, vocab_dir=None, sampling_decode=sampling_decode, next_symbol=next_symbol, tokenize=tokenize, detokenize=detokenize):
    """Translate `sentence`, print the source and the translation, and return the translation.

    `sampling_decode` is called without a temperature, so its own default
    temperature is used. All injected helper functions are forwarded to it.

    Args:
        sentence: the English text to translate.
        NMTAttn: the model used for decoding.
        vocab_file: vocabulary file name.
        vocab_dir: vocabulary directory.
        sampling_decode, next_symbol, tokenize, detokenize: helper functions
            used for decoding; they default to the notebook's implementations.

    Returns:
        The translated sentence.
    """
    decoded = sampling_decode(sentence,
                              NMTAttn=NMTAttn,
                              vocab_file=vocab_file,
                              vocab_dir=vocab_dir,
                              next_symbol=next_symbol,
                              tokenize=tokenize,
                              detokenize=detokenize)

    # only the detokenized sentence is needed here
    _output_tokens, _last_log_prob, translated_sentence = decoded

    print("English: ", sentence)
    print("German: ", translated_sentence)

    return translated_sentence

In [None]:
#put a custom string here
your_sentence = 'My name is Piyush.'

# the trailing semicolon keeps the notebook from echoing the return value
greedy_decode_test(your_sentence, NMTAttn=model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR);

English:  My name is Piyush.
German:  Ich heiße Piyush.


In [None]:
# Second example; here the model is passed positionally as the NMTAttn argument
greedy_decode_test('You are almost done with the assignment!', model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR);

English:  You are almost done with the assignment!
German:  Sie sind fast mit der Aufgabe fertig!


<a name="4-2"></a>
### Implement Minimum Bayes-Risk Decoding

Picking the most probable token at each step does not necessarily produce the best overall translation.
Another approach, which often gives better results at the cost of generating several samples, is Minimum Bayes Risk decoding (MBR).
We can implement it like this:
- take several random samples (samples are our translated sequences)
- score each sample against all other samples
- select the one with the highest score

In [None]:
##Now let's use Minimum Bayes Risk to decode our Sample or translation

In [None]:
##let's create a function to generate samples

### Helper Function for Generating Samples

In [None]:
def generate_samples(sentence, n_samples, NMTAttn=None, temperature=0.6, vocab_file=None, vocab_dir=None, sampling_decode=sampling_decode, next_symbol=next_symbol, tokenize=tokenize, detokenize=detokenize):
    """Generate several candidate translations of one sentence.

    Args:
        sentence: the text to translate.
        n_samples: number of candidate translations to generate.
        NMTAttn: the model used for decoding.
        temperature: sampling temperature handed to `sampling_decode`.
        vocab_file: vocabulary file name.
        vocab_dir: vocabulary directory.
        sampling_decode: function that produces one translation.
        next_symbol, tokenize, detokenize: helper functions forwarded to
            `sampling_decode`.

    Returns:
        Tuple (samples, log_probs): the token list of every sample and, for
        each sample, the log probability value returned by `sampling_decode`.
    """
    #lists to contain samples and probabilities
    samples, log_probs = [], []

    for _ in range(n_samples):

        # get a sample using the sampling_decode() function; every injected
        # helper is forwarded so that custom implementations are honoured
        sample, logp, _ = sampling_decode(sentence, NMTAttn, temperature,
                                          vocab_file=vocab_file, vocab_dir=vocab_dir,
                                          next_symbol=next_symbol,
                                          tokenize=tokenize, detokenize=detokenize)

        # append the token list to the samples list
        samples.append(sample)

        # append the log probability to the log_probs list
        log_probs.append(logp)

    return samples, log_probs

In [None]:
# Draw 4 samples; the function's default temperature applies because none is passed
generate_samples('how are you today?', 4, model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

([[7906, 644, 21, 352, 102, 1],
  [595, 75, 67, 352, 102, 1],
  [595, 119, 67, 352, 102, 1],
  [595, 24, 34, 352, 102, 1]],
 [-0.0004901885986328125,
  -3.814697265625e-06,
  -1.33514404296875e-05,
  -7.62939453125e-06])

<a name='4-2-2'></a>
### Helper Function to Compare the Overlaps

Let us now build our functions to compare a sample against another. We will be calculating scores for unigram overlaps. One of the simpler metrics is the *Jaccard* similarity, which is the size of the intersection of two sets divided by the size of their union.

In [None]:
##Now let's compare the generated samples against each other.

### Implementing Jaccard Similarity

In [None]:
def jaccard_similarity(candidate, reference):
    """Return the Jaccard similarity (intersection over union) of two token lists.

    Args:
        candidate: iterable of tokens of the candidate sentence.
        reference: iterable of tokens of the reference sentence.

    Returns:
        Number of unique tokens common to both, divided by the number of
        unique tokens found in either.
    """
    #duplicates do not count: work on the unique tokens only
    candidate_tokens = set(candidate)
    reference_tokens = set(reference)

    #tokens found in both / tokens found in at least one of them
    shared_tokens = candidate_tokens & reference_tokens
    combined_tokens = candidate_tokens | reference_tokens

    return len(shared_tokens) / len(combined_tokens)

In [None]:
# Try the function: 3 shared tokens out of 4 distinct tokens in total -> 0.75
jaccard_similarity([1, 2, 3], [1, 2, 3, 4])

0.75

### Calculating the Overlap

In [None]:
def average_overlap(similarity_fn, samples, *ignore_params):
    """Score every sample by its mean similarity to all other samples.

    Args:
        similarity_fn: function taking (candidate, sample) and returning a
            similarity score.
        samples: list of token lists to compare against each other.
        *ignore_params: accepted and ignored, so that this function can be
            called with the same arguments as other scoring functions.

    Returns:
        Dictionary mapping the index of each sample to its average
        similarity with the other samples. A lone sample has nothing to be
        compared with and scores 0.0; an empty list gives an empty dictionary.
    """
    # every candidate is compared with all samples except itself
    n_others = len(samples) - 1

    # initialize dictionary
    scores = {}

    # run a for loop for each sample
    for index_candidate, candidate in enumerate(samples):

        # initialize overlap
        overlap = 0.0

        # run a for loop for each sample
        for index_sample, sample in enumerate(samples):

            # skip if the candidate index is the same as the sample index
            if index_candidate == index_sample:
                continue

            # add the overlap between candidate and sample to the total overlap
            overlap += similarity_fn(candidate, sample)

        # average over the number of comparisons; dividing by the loop index
        # instead would fail with a division by zero for a single sample
        scores[index_candidate] = overlap / n_others if n_others > 0 else 0.0

    return scores

### Using all the Functions above and Implementing MBR Decoding

In [None]:
def mbr_decode(sentence, n_samples, average_overlap=average_overlap, jaccard_similarity=jaccard_similarity, NMTAttn=None, temperature=0.6, vocab_file=None, vocab_dir=None, generate_samples=generate_samples, sampling_decode=sampling_decode, next_symbol=next_symbol, tokenize=tokenize, detokenize=detokenize):
    """Translate a sentence using Minimum Bayes Risk decoding.

    Several candidate translations are sampled, each one is scored against
    all the others, and the candidate with the highest score is returned.

    Args:
        sentence: the text to translate.
        n_samples: number of candidate translations to generate (at least 1).
        average_overlap: scoring function called as
            (similarity function, samples, log_probs).
        jaccard_similarity: similarity function handed to the scoring function.
        NMTAttn: the model used for decoding.
        temperature: sampling temperature.
        vocab_file: vocabulary file name.
        vocab_dir: vocabulary directory.
        generate_samples: function that produces the candidate translations.
        sampling_decode, next_symbol, tokenize, detokenize: helper functions
            forwarded to `generate_samples`.

    Returns:
        Tuple (translated_sentence, max_score_key, scores).

    Raises:
        ValueError: if no samples were generated.
    """
    #generating the samples; the injected helpers are forwarded so that
    #custom implementations are actually used
    samples, log_probs = generate_samples(sentence, n_samples, NMTAttn, temperature, vocab_file, vocab_dir,
                                          sampling_decode=sampling_decode, next_symbol=next_symbol,
                                          tokenize=tokenize, detokenize=detokenize)

    #without candidates there is nothing to choose from
    if not samples:
        raise ValueError("mbr_decode needs at least one sample; got n_samples=%r" % (n_samples,))

    # use the scoring function to get a dictionary of scores
    scores = average_overlap(jaccard_similarity, samples, log_probs)

    # find the key with the highest score
    max_score_key = max(scores, key = scores.get)

    # detokenize the token list associated with the max_score_key
    translated_sentence = detokenize(samples[max_score_key], vocab_file, vocab_dir)

    return (translated_sentence, max_score_key, scores)

In [None]:
# Sampling temperature used for the MBR example below
TEMPERATURE = 1.0

# put a custom string here
your_sentence = 'She speaks English and German.'

In [None]:
# MBR decoding with 4 samples; [0] selects the translated sentence from the returned tuple
mbr_decode(your_sentence, 4, average_overlap, jaccard_similarity, model, TEMPERATURE, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)[0]

'Sie spricht Englisch und Deutsch.'