In [1]:
import numpy as np
import pandas as pd 
import os

import re
import time
import io

from torchtext.utils import download_from_url, extract_archive

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 4663715354342735275,
 name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 8526272000179476442
 physical_device_desc: "device: XLA_CPU device"]

In [2]:
url_base = 'https://storage.googleapis.com/haiku-dataset/'
train_urls = ('train.kigo.gz', 'train.haiku.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]

In [3]:
with open('.data/train.kigo', mode='r', encoding='utf-8') as f:
    kigo = f.read()

with open('.data/train.haiku', mode='r', encoding='utf-8') as f:
    haiku = f.read()

In [4]:
kigo = re.sub(r'\.(?=[0-9]|[a-z]|[A-Z])', '.###', kigo)
kigo = re.sub(r'\.###', '', kigo)
kigo = re.sub(r'  +', ' ', kigo)
kigo = kigo.split('\n')

haiku = re.sub(r'\.(?=[0-9]|[a-z]|[A-Z])', '.###', haiku)
haiku = re.sub(r'\.###', '', haiku)
haiku = re.sub(r'  +', ' ', haiku)
haiku = corpus_fr.split('\n')

In [5]:
tokenizer_kigo = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        corpus_en, target_vocab_size=2**13)
tokenizer_haiku = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        corpus_fr, target_vocab_size=2**13)

In [6]:
VOCAB_SIZE_KIGO = tokenizer_en.vocab_size + 2 # 2 extra spaces are for starting and ending of sentence
VOCAB_SIZE_HAIKU = tokenizer_fr.vocab_size + 2

In [7]:
inputs = [[VOCAB_SIZE_KIGO-2] + tokenizer_en.encode(sentence) + [VOCAB_SIZE_KIGO-1] for sentence in corpus_en]

outputs = [[VOCAB_SIZE_HAIKU-2] + tokenizer_fr.encode(sentence) + [VOCAB_SIZE_HAIKU-1] for sentence in corpus_fr]

In [8]:

MAX_LENGTH = 20
idx_to_remove = [count for count, sent in enumerate(inputs) if len(sent) > MAX_LENGTH]

for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]
    
idx_to_remove = [count for count, sent in enumerate(outputs) if len(sent) > MAX_LENGTH]

for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]

In [9]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)

outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

In [10]:
BATCH_SIZE = 128
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))

dataset = dataset.cache() # To increase speed
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [11]:
class PositionalEncoding(layers.Layer):
    '''
    Custom Positional Encoding Class. Inherited from tensorflow.keras.layers.Layer
    '''
    def __init__(self):
        super(PositionalEncoding, self).__init__()
        
    def get_angles(self, pos, i, d_model): # Input shapes -- pos: (seq_length, 1); i: (1, d_model)
        angles = 1 / np.power(10000., (2*(i//2))/np.float32(d_model)) # Angles have even index both for odd and even indices
        return pos * angles # Returns matrix of shape (seq_length, d_model)
    
    def call(self, inputs):
        
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_length)[:, np.newaxis],
                                 np.arange(d_model)[np.newaxis, :],
                                 d_model)
        
        angles[:, 0::2] = np.sin(angles[:, 0::2]) # Gives a step-size of 2 to include only even numbers
        angles[:, 1::2] = np.cos(angles[:, 1::2]) # Gives a step-size of 2 to include only odd numbers
        pos_encoding = angles[np.newaxis, ...] # Adding an extra dimension for batching compatibility
        
        return inputs + tf.cast(pos_encoding, tf.float32)

In [12]:
def scaled_dot_product_attention(queries, keys, values, mask):
    '''
    queries: Q Matrix
    keys: K Matrix
    values: V Matrix
    mask: can be used for both look-ahead masking and masking for padded zeroes
    '''
    
    product = tf.matmul(queries, keys, transpose_b=True)
    
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32) # dk value
    
    scaled_product = product / tf.math.sqrt(keys_dim)
    
    if mask is not None:
        scaled_product += (mask * -1e9) # adding a very small number for mask so that softmax value for it becomes zero
        
    attention = tf.matmul(tf.nn.softmax(scaled_product, axis=-1), values)
    
    return attention

In [13]:
class MultiHeadAttention(layers.Layer):
    '''
    Custom Multi-head Attention Class. Inherited from tensorflow.keras.layers.Layer
    nb_proj: Number of projections that matrices should be split into
    '''
    
    def __init__(self, nb_proj):
        super(MultiHeadAttention, self).__init__()
        self.nb_proj = nb_proj
    
    def build(self, input_shape):
        '''
        Runs when object if first time used. Unlike init function, which runs when the object is created
        '''
        
        self.d_model = input_shape[-1] # Input Shape is the shape of Q matrix
        assert self.d_model % self.nb_proj == 0
        
        self.d_proj = self.d_model //self.nb_proj
        
        self.query_lin = layers.Dense(units = self.d_model)
        self.key_lin = layers.Dense(units = self.d_model)
        self.value_lin = layers.Dense(units = self.d_model)
        self.final_lin = layers.Dense(units = self.d_model)
        
    def split_proj(self, inputs, batch_size):
        '''
        Function for splitting input matrix into projections
        inputs: input tensor with shape (batch_size, seq_length, d_model)
        returns a tensor of shape(batch_size, nb_proj, seq_length, d_proj)
        '''
        
        shape = (batch_size,
                 -1,
                 self.nb_proj,
                 self.d_proj)
        # print(shape, tf.shape(inputs)) --Debugging print
        
        splitted_inputs = tf.reshape(inputs, shape=shape) # shape of splitted_inputs: (batch_size, seq_length, nb_proj, d_proj)
        
        return tf.transpose(splitted_inputs, perm=[0, 2, 1, 3])
        
    def call(self, queries, keys, values, mask):
        '''
        queries: Q Matrix
        keys: K Matrix
        values: V Matrix
        mask: can be used for both look-ahead masking and masking for padded zeroes
        '''
        batch_size = tf.shape(queries)[0]
        
        queries = self.query_lin(queries) # Applying Big Linear function to Q Matrix
        keys = self.key_lin(keys) # Applying Big Linear function to K Matrix
        values = self.value_lin(values) # Applying Big Linear function to V Matrix
        
        queries = self.split_proj(queries, batch_size) # Splitting into projections
        keys = self.split_proj(keys, batch_size) # Splitting into projections
        values = self.split_proj(values, batch_size) # Splitting into projections
        
        attention = scaled_dot_product_attention(queries, keys, values, mask)
        
        #Concatinating the splitted projections after attention in reverse process of split_proj function
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention,
                                      shape=(batch_size, -1, self.d_model))
        
        # Applying final Linear function
        outputs = self.final_lin(concat_attention)
        
        return outputs

In [14]:
class EncoderLayer(layers.Layer):
    '''
    Custom Encoder Layer Class. Inherited from tensorflow.keras.layers.Layer
    FFN_units: Feed Forward Network units
    nb_proj: Number of Projections
    dropout: Dropout Rate
    '''
    
    def __init__(self, FFN_units, nb_proj, dropout):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout = dropout
    
    def build(self, input_shape):
        '''
        Runs when object if first time used. Unlike init function, which runs when the object is created
        '''
        self.d_model = input_shape[-1]
        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        self.dense_1 = layers.Dense(units=self.FFN_units, activation='relu')
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, mask, training):
        attention = self.multi_head_attention(inputs,
                                              inputs,
                                              inputs,
                                              mask)
        
        attention = self.dropout_1(attention, training=training)
        attention = self.norm_1(attention + inputs)
        
        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs)
        outputs = self.norm_2(outputs + attention)
        
        return outputs

In [15]:
class Encoder(layers.Layer):
    '''
    Custom Encoder Class. Inherited from tensorflow.keras.layers.Layer
    nb_layers: Number of layers of encoders
    FFN_units: Feed Forward Network units
    nb_proj: Number of Projections
    dropout: Dropout Rate
    vocab_size: Vocabulary Size
    d_model: last dimension of input matrix
    '''
    
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name='encoder'):
        super(Encoder, self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        self.enc_layers = [EncoderLayer(FFN_units,
                                        nb_proj,
                                        dropout)
                           for _ in range(nb_layers)]
        
    def call(self, inputs, mask, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.nb_layers):
            outputs = self.enc_layers[i](outputs, mask, training)
        
        return outputs

In [16]:
class DecoderLayer(layers.Layer):
    '''
    Custom Decoder Layer Class. Inherited from tensorflow.keras.layers.Layer
    FFN_units: Feed Forward Network units
    nb_proj: Number of Projections
    dropout: Dropout Rate
    '''
    
    def __init__(self, FFN_units, nb_proj, dropout):
        super(DecoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout = dropout
    
    def build(self, input_shape):
        '''
        Runs when object if first time used. Unlike init function, which runs when the object is created
        '''
        self.d_model = input_shape[-1]
        
        # Layers for Phase I
        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        # Layers for Phase II
        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
        # Layers for Phase III
        self.dense_1 = layers.Dense(units=self.FFN_units, activation='relu')
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(rate=self.dropout)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        
        # Phase I
        attention = self.multi_head_attention_1(inputs,
                                                inputs,
                                                inputs,
                                                mask_1)
        attention = self.dropout_1(attention, training)
        attention = self.norm_1(attention + inputs)
        
        # Phase II
        attention_2 = self.multi_head_attention_2(attention,
                                                enc_outputs,
                                                enc_outputs,
                                                mask_2)
        attention_2 = self.dropout_2(attention_2, training)
        attention_2 = self.norm_2(attention_2 + attention)
        
        # Phase III
        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs, training)
        outputs = self.norm_3(outputs + attention_2)
        
        return outputs

In [17]:
class Decoder(layers.Layer):
    '''
    Custom Decoder Class. Inherited from tensorflow.keras.layers.Layer
    nb_layers: Number of layers of decoders
    FFN_units: Feed Forward Network units
    nb_proj: Number of Projections
    dropout: Dropout Rate
    vocab_size: Vocabulary Size
    d_model: last dimension of input matrix
    '''
    
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name='decoder'):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.nb_layers = nb_layers
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        
        self.dec_layers = [DecoderLayer(FFN_units,
                                        nb_proj,
                                        dropout)
                          for _ in range(nb_layers)]
        
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.nb_layers):
            outputs = self.dec_layers[i](outputs,
                                         enc_outputs,
                                         mask_1,
                                         mask_2,
                                         training)
            
        return outputs

In [18]:
class Transformer(tf.keras.Model):
    '''
    Custom Transformer Model Class. Inherited from tensorflow.keras.Model
    nb_layers: Number of layers of Encoders and Decoders
    FFN_units: Feed Forward Network units
    nb_proj: Number of Projections
    dropout: Dropout Rate
    vocab_size_enc: Vocabulary Size of encoder
    vocab_size_dec: Vocabulary Size of decoder
    d_model: last dimension of input matrix
    '''
    
    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 name='transformer'):
        super(Transformer, self).__init__(name=name)
        
        self.encoder = Encoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout,
                               vocab_size_enc,
                               d_model)
        
        self.decoder = Decoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout,
                               vocab_size_dec,
                               d_model)
        self.last_linear = layers.Dense(units = vocab_size_dec)
        
    
    def create_padding_mask(self, seq):
        '''
        Function for creating Padding masks
        seq: sequence of numbers post-tokenization of shape (batch_size, seq_length)
        '''
        mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]
    
    def create_look_ahead_mask(self, seq):
        '''
        Function for creating Look Ahead masks
        '''
        seq_len = tf.shape(seq)[1]
        look_ahead_mask = 1- tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return look_ahead_mask
        
    def call(self, enc_inputs, dec_inputs, training):
        
        enc_mask = self.create_padding_mask(enc_inputs)
        dec_mask_1 = tf.maximum(
                                self.create_padding_mask(dec_inputs),
                                self.create_look_ahead_mask(dec_inputs))
        dec_mask_2 = self.create_padding_mask(enc_inputs)
        
        enc_outputs = self.encoder(enc_inputs, enc_mask, training)
        dec_outputs = self.decoder(dec_inputs,
                                   enc_outputs,
                                   dec_mask_1,
                                   dec_mask_2,
                                   training)
        
        outputs = self.last_linear(dec_outputs)
        
        return outputs

In [19]:
tf.keras.backend.clear_session()

#Hyper-Parameters, with article parameters given as comments
D_MODEL = 128 # 512
NB_LAYERS = 4 # 6
FFN_UNITS = 512 # 2048
NB_PROJ = 8 # 8
DROPOUT = 0.1 # 0.1

In [20]:
transformer = Transformer(vocab_size_enc=VOCAB_SIZE_KIGO,
                          vocab_size_dec=VOCAB_SIZE_HAIKU,
                          d_model=D_MODEL,
                          nb_layers=NB_LAYERS,
                          FFN_units=FFN_UNITS,
                          nb_proj=NB_PROJ,
                          dropout=DROPOUT)

In [21]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

def loss_function(target, pred):
    '''
    Custom loss function with no reduction and loss for padding tokens is masked to zero
    '''
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    loss_ = loss_object(target, pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [22]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    '''
    Custom Learning Rate Scheduler Class. Inherited from tensorflow.keras.optimizers.schedules.LearningRateSchedule
    warmup_steps: steps till which learning rate is increased linearly and after which it is decreased exponentially
    '''
    
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
        
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)

In [23]:
EPOCHS = 10

for epoch in range(EPOCHS):
    
    print("Start of epoch {}".format(epoch+1))
    start = time.time()
    
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]
        
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)
        
        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        
        train_loss(loss)
        train_accuracy(dec_outputs_real, predictions)
        
        if batch % 50 == 0:
            print("Epoch: {} Batch: {} Loss: {:.4f} Accuracy: {:.4f}".format(
                epoch + 1, batch, train_loss.result(), train_accuracy.result()))
    
    print("Time taken for epoch {}: {}".format(epoch + 1 , time.time() - start))

Start of epoch 1
Epoch: 1 Batch: 0 Loss: 5.4728 Accuracy: 0.0000
Epoch: 1 Batch: 50 Loss: 5.4832 Accuracy: 0.0047
Epoch: 1 Batch: 100 Loss: 5.4172 Accuracy: 0.0265
Epoch: 1 Batch: 150 Loss: 5.3488 Accuracy: 0.0370
Epoch: 1 Batch: 200 Loss: 5.2841 Accuracy: 0.0458
Epoch: 1 Batch: 250 Loss: 5.2077 Accuracy: 0.0535
Time taken for epoch 1: 253.77749586105347
Start of epoch 2
Epoch: 2 Batch: 0 Loss: 4.6495 Accuracy: 0.0855
Epoch: 2 Batch: 50 Loss: 4.5490 Accuracy: 0.0862
Epoch: 2 Batch: 100 Loss: 4.4561 Accuracy: 0.0875
Epoch: 2 Batch: 150 Loss: 4.3828 Accuracy: 0.0881
Epoch: 2 Batch: 200 Loss: 4.3266 Accuracy: 0.0883
Epoch: 2 Batch: 250 Loss: 4.2811 Accuracy: 0.0887
Time taken for epoch 2: 256.08761048316956
Start of epoch 3
Epoch: 3 Batch: 0 Loss: 3.9982 Accuracy: 0.0921
Epoch: 3 Batch: 50 Loss: 3.9873 Accuracy: 0.0923
Epoch: 3 Batch: 100 Loss: 3.9662 Accuracy: 0.0931
Epoch: 3 Batch: 150 Loss: 3.9480 Accuracy: 0.0942
Epoch: 3 Batch: 200 Loss: 3.9259 Accuracy: 0.0954
Epoch: 3 Batch: 250 Lo

In [24]:
def evaluate(inp_sentence):
    inp_sentence = \
        [VOCAB_SIZE_KIGO-2] + tokenizer_en.encode(inp_sentence) + [VOCAB_SIZE_KIGO-1]
    enc_input = tf.expand_dims(inp_sentence, axis=0)
    
    output = tf.expand_dims([VOCAB_SIZE_HAIKU-2], axis=0)
    
    for _ in range(MAX_LENGTH):
        predictions = transformer(enc_input, output, False) # shape of predictions: (1, seq_length, vocab_size_fr)
        prediction = predictions[:, -1:, :]
        
        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)
        
        if predicted_id == VOCAB_SIZE_HAIKU-1:
            return tf.squeeze(output, axis=0)
        
        output = tf.concat([output, predicted_id], axis=-1)
        
    return tf.squeeze(output, axis=0)
        

In [25]:
def translate(sentence):
    output = evaluate(sentence).numpy()
    
    predicted_sentence = tokenizer_fr.decode(
        [i for i in output if i < VOCAB_SIZE_HAIKU-2]
    )
    
    print("Input: {}".format(sentence))
    print("Predicted translation: {}".format(predicted_sentence))

In [26]:
translate("冬 白鳥 頸")
translate("春 菜の花 明るい")
translate("夏 恋 母")

Input: 冬 白鳥 頸
Predicted translation: 白鳥 の 白鳥 の 中 に 白鳥 か な
Input: 春 菜の花 明るい
Predicted translation: 菜の花 や 菜の花 の 中 に は の 音
Input: 夏 恋 母
Predicted translation: 水 の 日 の 水 に なり し 日 の 恋


In [32]:
translate("冬 風邪")
translate("春 休み")
translate("夏 蚊帳")

Input: 冬 風邪
Predicted translation: 冬 の 風邪 水 に 入る もの の 稰 か な
Input: 春 休み
Predicted translation: 連翹 や 海 の 中 に も ある ところ
Input: 夏 蚊帳
Predicted translation: 鮎 の 中 に 水 の 音 し て いる
