In [1]:
import numpy as np
import sentencepiece as spm
import random
import json
from datasets import load_dataset
from torch.utils.data import DataLoader
import os
import time
import pandas as pd
import tensorflow as tf
import keras
from tensorflow.keras.layers import Bidirectional, LSTM, Input, Dense, TimeDistributed, Embedding, Concatenate
from tensorflow.keras.models import Model
import pickle
import warnings

In [2]:
tf.keras.backend.clear_session()
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
       tf.config.experimental.set_memory_growth(gpu,True)

In [3]:
sp = spm.SentencePieceProcessor()
sp.load("/kaggle/input/nepali-summarization-tokenizer/summarization_50000.model")

True

In [4]:
parameters = {
    'VOC_SIZE': 50_000,

    'ENCODER_LAYERS': 6,
    'DECODER_LAYERS': 6,

    'ENCODER_SEQUENCE_LENGTH': 256,
    'DECODER_SEQUENCE_LENGTH': 12,
    
    'EMBEDDING_DIMENSION': 256,
    'ENCODER_ATTENTION_HEADS': 8,
    'DECODER_ATTENTION_HEADS': 8,
    'ENCODER_FFN_DIM': 4 * 512,
    'DECODER_FFN_DIM': 4 * 512,
    
    'DROPOUT': 0.2,

    'BATCH_SIZE': 64,
    'EPOCHS': 20,
    'EARLY_STOPPING': 4,
    'L2_REG': 0.01,
    
    'LEARNING_RATE': 1e-4,

    'LABEL_SMOOTHING': 0.1,
    'TEACHER_FORCING_RATIO': 0.5,
    'GRAD_CLIP': 1.0,

    'PAD_TOKEN': '<pad>',
    'UNK_TOKEN': '<unk>',
    'SOS_TOKEN': '<s>',
    'EOS_TOKEN': '</s>',

    'PAD_TOKEN_ID': sp.pad_id(),
    'UNK_TOKEN_ID': sp.unk_id(),
    'SOS_TOKEN_ID': sp.bos_id(),
    'EOS_TOKEN_ID': sp.eos_id(),

    'COVERAGE_WEIGHT': 1.0,
}

In [5]:
dataset_files = {
    "train": "/kaggle/input/nepali-summarization-set-cleaned/summarization_set_cleaned_train.csv",
    "test": "/kaggle/input/nepali-summarization-set-cleaned/summarization_set_cleaned_test.csv",
    "eval": "/kaggle/input/nepali-summarization-set-cleaned/summarization_set_cleaned_val.csv"
}

In [6]:
def collate_fn(batch):
    data = {
        'encoder_inputs': [],
        'decoder_inputs': [],
        'decoder_targets': [],
    }

    for row in batch:
        news_encoded = sp.encode(row['news'])
        title_encoded = sp.encode(row['title']) + [sp.eos_id()]
        title_encoded_inp = [sp.bos_id()] + sp.encode(row['title'])

        if len(news_encoded) >= parameters['ENCODER_SEQUENCE_LENGTH']:
            data['encoder_inputs'].append(news_encoded[:parameters['ENCODER_SEQUENCE_LENGTH']])
        else:
            data['encoder_inputs'].append(news_encoded + [sp.pad_id()] * (parameters['ENCODER_SEQUENCE_LENGTH'] - len(news_encoded)))

        if len(title_encoded) >= parameters['DECODER_SEQUENCE_LENGTH']:
            data['decoder_targets'].append(title_encoded[:parameters['DECODER_SEQUENCE_LENGTH']])
        else:
            data['decoder_targets'].append(title_encoded + [sp.pad_id()] * (parameters['DECODER_SEQUENCE_LENGTH'] - len(title_encoded)))
            
        if len(title_encoded_inp) >= parameters['DECODER_SEQUENCE_LENGTH']:
            data['decoder_inputs'].append(title_encoded_inp[:parameters['DECODER_SEQUENCE_LENGTH']])
        else:
            data['decoder_inputs'].append(title_encoded_inp + [sp.pad_id()] * (parameters['DECODER_SEQUENCE_LENGTH'] - len(title_encoded_inp)))

    return data

In [7]:
dataset = load_dataset("csv", data_files=dataset_files, streaming=True)
train_dataset = dataset["train"]
test_dataset = dataset["test"]
eval_dataset = dataset["eval"]

In [8]:
train_dataloader = DataLoader(train_dataset, batch_size=parameters['BATCH_SIZE'], collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=parameters['BATCH_SIZE'], collate_fn=collate_fn)
eval_dataloader = DataLoader(eval_dataset, batch_size=parameters['BATCH_SIZE'], collate_fn=collate_fn)

In [9]:
df_eval = pd.read_csv('/kaggle/input/nepali-summarization-set-cleaned/summarization_set_cleaned_val.csv')

In [10]:
train_inputs = None
train_targets = None
train_dec_inputs = None
for batch in train_dataloader:
    if train_inputs is None:
        train_inputs = batch['encoder_inputs']
        train_targets = batch['decoder_targets']
        train_decoder_inputs = batch['decoder_inputs']
    else:
        train_inputs = train_inputs + batch['encoder_inputs']
        train_targets = train_targets + batch['decoder_targets']
        train_decoder_inputs = train_decoder_inputs + batch['decoder_inputs']
train_inputs = np.array(train_inputs)
train_targets = np.array(train_targets)
train_decoder_inputs = np.array(train_decoder_inputs)

In [11]:
test_inputs = None
test_targets = None
test_decoder_inputs = None
for batch in test_dataloader:
    if test_inputs is None:
        test_inputs = batch['encoder_inputs']
        test_targets = batch['decoder_targets']
        test_decoder_inputs = batch['decoder_inputs']
    else:
        test_inputs = test_inputs + batch['encoder_inputs']
        test_targets = test_targets + batch['decoder_targets']
        test_decoder_inputs = test_decoder_inputs + batch['decoder_inputs']
test_inputs = np.array(test_inputs)
test_targets = np.array(test_targets)
test_decoder_inputs = np.array(test_decoder_inputs)

In [12]:
eval_inputs = None
eval_targets = None
eval_decoder_inputs = None
for batch in eval_dataloader:
    if eval_inputs is None:
        eval_inputs = batch['encoder_inputs']
        eval_targets = batch['decoder_targets']
        eval_decoder_inputs = batch['decoder_inputs']
    else:
        eval_inputs = eval_inputs + batch['encoder_inputs']
        eval_targets = eval_targets + batch['decoder_targets']
        eval_decoder_inputs = eval_decoder_inputs + batch['decoder_inputs']
eval_inputs = np.array(eval_inputs)
eval_targets = np.array(eval_targets)
eval_decoder_inputs = np.array(eval_decoder_inputs)

In [13]:
train = tf.data.Dataset.from_tensor_slices((train_inputs, train_decoder_inputs, train_targets)).batch(parameters["BATCH_SIZE"], drop_remainder=False)
test = tf.data.Dataset.from_tensor_slices((test_inputs, test_decoder_inputs, test_targets)).batch(parameters["BATCH_SIZE"], drop_remainder=False)
val = tf.data.Dataset.from_tensor_slices((eval_inputs, eval_decoder_inputs, eval_targets)).batch(parameters["BATCH_SIZE"], drop_remainder=False)

In [14]:
@tf.keras.utils.register_keras_serializable()
class Embeddings(tf.keras.layers.Layer):
    def __init__(self, d_model: int, seq_len: int, voc_size: int, dropout_rate: float = 0.1, **kwargs):
        super(Embeddings, self).__init__(**kwargs)
        self.d_model = d_model
        self.seq_len = seq_len
        self.voc_size = voc_size
        self.dropout_rate = dropout_rate

        self.input_emb = tf.keras.layers.Embedding(self.voc_size, self.d_model, name='Sequence_Embedding')
        self.positional_emb = tf.keras.layers.Embedding(self.seq_len, self.d_model, name='Positional_Embedding')
        self.dropout = tf.keras.layers.Dropout(self.dropout_rate)

    def build(self, input_shape):
        self.input_emb.build(input_shape)
        self.positional_emb.build(input_shape)
        output_shape = self.input_emb.compute_output_shape(input_shape)
        self.dropout.build(output_shape)

    def compute_output_shape(self, input_shape):
        return self.input_emb.compute_output_shape(input_shape)

    def call(self, inputs, training=False):
        # inputs -> (batch, seq_len)
        positions = tf.repeat(tf.expand_dims(tf.range(tf.shape(inputs)[1]), 0), [tf.shape(inputs)[0]], axis=0) # (batch, seq_len) 
        inp_emb = self.input_emb(inputs) # (batch, seq_len, d_model)
        pos_emb = self.positional_emb(positions) # (batch, seq_len, d_model)

        return self.dropout(inp_emb + pos_emb, training=training) # (batch, seq_len, d_model)

In [15]:
@tf.keras.utils.register_keras_serializable()
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model: int, h: int, dropout_rate: float = 0.1, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        assert d_model % h == 0
        self.d_model = d_model
        self.h = h
        self.d_k = self.d_model // self.h
        self.dropout_rate = dropout_rate

        self.w_q = tf.keras.layers.Dense(self.d_model) 
        self.w_k = tf.keras.layers.Dense(self.d_model) 
        self.w_v = tf.keras.layers.Dense(self.d_model) 
        self.w_o = tf.keras.layers.Dense(self.d_model)

        self.dropout = tf.keras.layers.Dropout(self.dropout_rate)

    def build(self, input_shape):
        q, k, v = input_shape
        self.w_q.build(q)
        self.w_k.build(k)
        self.w_v.build(v)
        self.w_o.build(q)
        self.dropout.build(q)

    def compute_output_shape(self, input_shape):
        q, k, v = input_shape
        return self.dropout.compute_output_shape(q), (q[0], self.h, q[1], k[1]) 
        
    def scaled_dot_product_attention(self, q, k, v, mask=None):
        attn_score = q @ tf.transpose(k, perm=[0,1,3,2]) / tf.sqrt(tf.cast(k.shape[-1], dtype=tf.float32))
        if mask is not None:
            # attn_score += (mask * -1e9)
            attn_score = tf.where(mask==0, -1e9, attn_score) # Set very small values where mask = 0
            
        attn_wts = tf.nn.softmax(attn_score, -1) # (batch, h, seq_len, seq_len) seq_len*seq_len because self attention
        outputs = attn_wts @ v # (batch, h, seq_len, d_k)
        return outputs, attn_wts

    def call(self, q, k, v, mask=None, training=False):
        q = self.w_q(q) # (batch, seq_len, d_model)
        k = self.w_k(k)
        v = self.w_v(v)

        # Convert (batch, seq_len, d_model) to (batch, h, seq_len, d_k)
        # Split d_model into h*d_k and then transpose the 2nd and 3rd dimension
        q = tf.transpose(tf.reshape(q, [tf.shape(q)[0], tf.shape(q)[1], self.h, self.d_k]), perm=[0,2,1,3])
        k = tf.transpose(tf.reshape(k, [tf.shape(k)[0], tf.shape(k)[1], self.h, self.d_k]), perm=[0,2,1,3])
        v = tf.transpose(tf.reshape(v, [tf.shape(v)[0], tf.shape(v)[1], self.h, self.d_k]), perm=[0,2,1,3])

        outputs, attn_weights = self.scaled_dot_product_attention(q, k, v, mask)

        # First Convert (batch, h, seq_len, d_k) to (batch, seq_len, d_model)
        # Reverse the above operations
        # Run through Dense to get (batch, seq_len, d_model) 
        outputs = self.w_o(tf.reshape(tf.transpose(outputs, perm=[0,2,1,3]), [tf.shape(outputs)[0], tf.shape(outputs)[2], self.d_model]))

        return self.dropout(outputs, training=training), attn_weights

In [16]:
# atn = MultiHeadAttention(d_model=100, h=2)
# enc_emb = tf.keras.random.uniform((4, 20, 100))
# dec_emb = tf.keras.random.uniform((4, 10, 100))
# atn.build([(dec_emb.shape), (enc_emb.shape), (enc_emb.shape)])
# x, y = atn(dec_emb, enc_emb, enc_emb)
# print(x.shape, y.shape)

In [17]:
@tf.keras.utils.register_keras_serializable()
class AddAndNorm(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(AddAndNorm, self).__init__(**kwargs)
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def build(self, input_shape):
        self.layer_norm.build(input_shape)

    def compute_output_shape(self, input_shape):
        return self.layer_norm.compute_output_shape(input_shape)
        
    def call(self, skip_conn, output):
        return self.layer_norm(skip_conn + output) # (batch, seq_len, d_model)

In [18]:
@tf.keras.utils.register_keras_serializable()
class PositionwiseFeedForwardNetwork(tf.keras.layers.Layer):
    def __init__(self, d_model: int, d_ff: int, dropout_rate: float = 0.1, **kwargs):
        super(PositionwiseFeedForwardNetwork, self).__init__(**kwargs)
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate

        self.inner = tf.keras.layers.Dense(self.d_ff, activation='relu')
        self.outer = tf.keras.layers.Dense(self.d_model)
        self.dropout = tf.keras.layers.Dropout(self.dropout_rate)

    def build(self, input_shape):
        self.inner.build(input_shape)
        output = self.inner.compute_output_shape(input_shape)
        self.outer.build(output)
        self.dropout.build(self.outer.compute_output_shape(output))

    def compute_output_shape(self, input_shape):
        return self.outer.compute_output_shape(self.inner.compute_output_shape(input_shape))
        
    def call(self, inputs, training=False):
        x = self.inner(inputs) # (batch, seq_len, d_ff)
        x = self.outer(x) # (batch, seq_len, d_model)
        return self.dropout(x, training=training)

In [19]:
@tf.keras.utils.register_keras_serializable()
class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self, d_model: int, h:int, d_ff: int, dropout_rate: float = 0.1, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.d_model = d_model
        self.h = h
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate
        
        self.mhsa = MultiHeadAttention(self.d_model, self.h, self.dropout_rate)
        self.add_norm1 = AddAndNorm()
        self.pffn = PositionwiseFeedForwardNetwork(self.d_model, self.d_ff, self.dropout_rate)
        self.add_norm2 = AddAndNorm()

    def build(self, input_shape):
        self.mhsa.build([input_shape, input_shape, input_shape])
        self.add_norm1.build(input_shape)
        self.pffn.build(input_shape)
        self.add_norm2.build(input_shape)

    def compute_output_shape(self, input_shape):
        return self.mhsa.compute_output_shape([input_shape, input_shape, input_shape])

    def call(self, inputs, mask=None, training=False):
        mhsa_outputs, attn_weights = self.mhsa(inputs, inputs, inputs, mask, training=training)
        x = self.add_norm1(inputs, mhsa_outputs)
        pffn_outputs = self.pffn(x, training=training)
        x = self.add_norm2(x, pffn_outputs)
        
        return x, attn_weights

In [20]:
@tf.keras.utils.register_keras_serializable()
class Encoder(tf.keras.layers.Layer):
    def __init__(self, N: int, d_model: int, seq_len: int, voc_size: int, h:int, d_ff: int, dropout_rate: float = 0.1, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        assert N > 0
        self.N = N
        self.d_model = d_model
        self.seq_len = seq_len
        self.voc_size = voc_size
        self.h = h
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate

        self.embedding = Embeddings(self.d_model, self.seq_len, self.voc_size, self.dropout_rate)
        self.enc_layers = [EncoderBlock(self.d_model, self.h, self.d_ff, self.dropout_rate) for _ in range(self.N)]

    def build(self, input_shape):
        self.embedding.build(input_shape)
        output = self.embedding.compute_output_shape(input_shape)
        for encoder in self.enc_layers:
            encoder.build(output)

    def compute_output_shape(self, input_shape):
        return self.enc_layers[0].compute_output_shape(self.embedding.compute_output_shape(input_shape))

    def call(self, inputs, mask=None, training=False):
        attn_weights = None
        x = self.embedding(inputs, training=training)
        for encoder in self.enc_layers:
            x, attn_weights = encoder(x, mask=mask, training=training) 
        return x, attn_weights

In [21]:
@tf.keras.utils.register_keras_serializable()
class DecoderBlock(tf.keras.layers.Layer):
    def __init__(self, d_model: int, h:int, d_ff: int, dropout_rate: float = 0.1, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)
        self.d_model = d_model
        self.h = h
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate
        
        self.mhsa = MultiHeadAttention(self.d_model, self.h, self.dropout_rate)
        self.add_norm1 = AddAndNorm()
        self.mhca = MultiHeadAttention(self.d_model, self.h, self.dropout_rate)
        self.add_norm2 = AddAndNorm()
        self.pffn = PositionwiseFeedForwardNetwork(self.d_model, self.d_ff, self.dropout_rate)
        self.add_norm3 = AddAndNorm()

    def build(self, input_shape):
        dec_input_shape, enc_output_shape = input_shape
        self.mhsa.build([dec_input_shape, dec_input_shape, dec_input_shape])
        self.add_norm1.build(dec_input_shape)
        self.mhca.build([dec_input_shape, enc_output_shape, enc_output_shape])
        self.add_norm2.build(dec_input_shape)
        self.pffn.build(dec_input_shape)
        self.add_norm3.build(dec_input_shape)

    def compute_output_shape(self, input_shape):
        dec_input_shape, enc_output_shape = input_shape
        return self.mhca.compute_output_shape([dec_input_shape, enc_output_shape, enc_output_shape])

    def call(self, inputs, encoder_outputs, decoder_mask=None, encoder_mask=None, training=False):
        mhsa_outputs, _ = self.mhsa(inputs, inputs, inputs, mask=decoder_mask, training=training)
        x = self.add_norm1(inputs, mhsa_outputs)
        mhca_outputs, attn_weights = self.mhca(x, encoder_outputs, encoder_outputs, mask=encoder_mask, training=training)
        x = self.add_norm2(x, mhca_outputs)
        pffn_outputs = self.pffn(x, training=training)
        x = self.add_norm3(x, pffn_outputs)

        return x, attn_weights

In [22]:
@tf.keras.utils.register_keras_serializable()
class Decoder(tf.keras.layers.Layer):
    def __init__(self, N: int, d_model: int, seq_len: int, voc_size: int, h:int, d_ff: int, dropout_rate: float = 0.1, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        assert N > 0
        self.N = N
        self.d_model = d_model
        self.seq_len = seq_len
        self.voc_size = voc_size
        self.h = h
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate

        self.embedding = Embeddings(self.d_model, self.seq_len, self.voc_size, self.dropout_rate)
        self.dec_layers = [DecoderBlock(self.d_model, self.h, self.d_ff, self.dropout_rate) for _ in range(self.N)]

    def build(self, input_shape):
        dec_input_shape, enc_output_shape = input_shape
        self.embedding.build(dec_input_shape)
        output = self.embedding.compute_output_shape(dec_input_shape)
        for decoder in self.dec_layers:
            decoder.build([output, enc_output_shape])

    def compute_output_shape(self, input_shape):
        dec_input_shape, enc_output_shape = input_shape
        return self.dec_layers[0].compute_output_shape([self.embedding.compute_output_shape(dec_input_shape), enc_output_shape])

    def call(self, inputs, encoder_outputs, decoder_mask=None, encoder_mask=None, training=False):
        attn_weights = None
        x = self.embedding(inputs, training=training)
        for decoder in self.dec_layers:
            x, attn_weights = decoder(x, encoder_outputs, decoder_mask=decoder_mask, encoder_mask=encoder_mask, training=training) 
        return x, attn_weights

In [23]:
@tf.keras.utils.register_keras_serializable()
class Transformer(tf.keras.layers.Layer):
    def __init__(
        self, 
        encoder_layers: int, 
        decoder_layers: int,
        d_model: int, 
        encoder_seq_len: int, 
        decoder_seq_len: int, 
        encoder_voc_size: int, 
        decoder_voc_size: int, 
        encoder_attention_heads: int, 
        decoder_attention_heads: int, 
        encoder_ffn_dim: int, 
        decoder_ffn_dim: int, 
        dropout: float = 0.1, 
        **kwargs
    ):
        super(Transformer, self).__init__(**kwargs)
        
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers
        assert self.encoder_layers > 0 and self.decoder_layers > 0, "Encoder and Decoder must have atleast 1 layer"
        
        self.d_model = d_model
        self.encoder_seq_len = encoder_seq_len
        self.decoder_seq_len = decoder_seq_len
        self.encoder_voc_size = encoder_voc_size
        self.decoder_voc_size = decoder_voc_size
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_attention_heads = decoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.decoder_ffn_dim = decoder_ffn_dim
        self.dropout = dropout

        self.encoder = Encoder(self.encoder_layers, self.d_model, self.encoder_seq_len, self.encoder_voc_size, self.encoder_attention_heads, self.encoder_ffn_dim, self.dropout)
        self.decoder = Decoder(self.decoder_layers, self.d_model, self.decoder_seq_len, self.decoder_voc_size, self.decoder_attention_heads, self.decoder_ffn_dim, self.dropout)
        self.projection = tf.keras.layers.Dense(decoder_voc_size)

    def build(self, input_shape):
        enc_input_shape, dec_input_shape = input_shape
        self.encoder.build(enc_input_shape)
        enc_output, _ = self.encoder.compute_output_shape(enc_input_shape)
        self.decoder.build([dec_input_shape, enc_output])
        dec_output, _ = self.decoder.compute_output_shape([dec_input_shape, enc_output])
        self.projection.build(dec_output)

    def compute_output_shape(self, input_shape):
        enc_input_shape, dec_input_shape = input_shape
        enc_output, enc_attn = self.encoder.compute_output_shape(enc_input_shape)
        dec_output, dec_attn = self.decoder.compute_output_shape([dec_input_shape, enc_output])
        return self.projection.compute_output_shape(dec_output), enc_attn, dec_attn

    def call(self, encoder_inputs, decoder_inputs, encoder_mask=None, decoder_mask=None, training=False):
        enc_outputs, enc_attn_wts = self.encoder(encoder_inputs, mask=encoder_mask, training=training)
        dec_outputs, dec_attn_wts = self.decoder(decoder_inputs, enc_outputs, encoder_mask=encoder_mask, decoder_mask=decoder_mask, training=training)
        logits = self.projection(dec_outputs)

        return logits, enc_attn_wts, dec_attn_wts

In [24]:
# batch, enc_layer, dec_layer, d_model = 2, 2, 2, 15
# enc_seq_len, dec_seq_len, enc_voc_size, dec_voc_size = 13, 9, 70, 35
# enc_attn_heads, dec_attn_heads = 5, 5
# enc_ffn, dec_ffn = 16, 16
# tformer = Transformer(enc_layer, dec_layer, d_model, enc_seq_len, dec_seq_len, enc_voc_size, dec_voc_size, enc_attn_heads, dec_attn_heads, enc_ffn, dec_ffn)
# enc_inp = tf.keras.random.randint((batch, enc_seq_len), 0, enc_voc_size)
# dec_inp = tf.keras.random.randint((batch, dec_seq_len), 0, dec_voc_size)
# tformer.build([enc_inp.shape, dec_inp.shape])
# res, enc_attn, dec_attn = tformer(enc_inp, dec_inp)
# print(res.shape, enc_attn.shape, dec_attn.shape)

In [25]:
@tf.keras.utils.register_keras_serializable()
def cross_entropy_loss(targets, output_dist, mask=None, label_smoothing=0.1):
    targets = tf.keras.utils.to_categorical(targets, num_classes=output_dist.shape[-1])
    scce_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction=None, label_smoothing=label_smoothing)
    step_scce_loss = scce_loss(targets, output_dist)
    if mask is not None:
        step_scce_loss = tf.reduce_mean(tf.reduce_sum(step_scce_loss*mask, 1) / tf.reduce_sum(mask, 1))
    return step_scce_loss

In [None]:
@tf.keras.utils.register_keras_serializable()
class TransformerTrainer(tf.keras.Model):
    def __init__(self, transformer: Transformer, label_smoothing: float = 0.1, **kwargs):
        super(TransformerTrainer, self).__init__(**kwargs)
        self.transformer = transformer
        self.label_smoothing = label_smoothing
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")
        self.val_loss_tracker = tf.keras.metrics.Mean(name="val_loss")

    def build(self, input_shape):
        enc_input, dec_input, dec_output = input_shape
        self.transformer.build([enc_input, dec_input])

    def compute_output_shape(self, input_shape):
        enc_input, dec_input, dec_output = input_shape
        return self.transformer.compute_output_shape([enc_input, dec_input])

    def compute_padding_mask(self, inp):
        mask = tf.cast(tf.math.not_equal(inp, 0), tf.float32)
        mask = mask[:, tf.newaxis, tf.newaxis, :]
        return mask
    
    def compute_padding_lookahead_mask(self, decoder_inp):
        mask = tf.cast(tf.math.equal(decoder_inp, 0), tf.float32)
        mask = mask[:, tf.newaxis, tf.newaxis, :]
        return tf.cast(tf.maximum(mask, 1 - tf.linalg.band_part(tf.ones((decoder_inp.shape[-1], decoder_inp.shape[-1])), -1, 0)) == 0, tf.float32)

    def call(self, inputs):
        encoder_inputs, decoder_inputs, targets = inputs
        encoder_mask = self.compute_padding_mask(encoder_inputs)
        decoder_mask = self.compute_padding_lookahead_mask(decoder_inputs)

        return self.transformer(encoder_inputs, decoder_inputs, encoder_mask, decoder_mask)

    @tf.function
    def train_step(self, inputs):
        encoder_inputs, decoder_inputs, targets = inputs

        loss = None
        encoder_mask = self.compute_padding_mask(encoder_inputs)
        decoder_mask = self.compute_padding_lookahead_mask(decoder_inputs)

        with tf.GradientTape() as tape:
            logits, _, _ = self.transformer(encoder_inputs, decoder_inputs, encoder_mask, decoder_mask, training=True)
            loss = self.loss(targets, logits, tf.cast(tf.math.not_equal(targets, 0), tf.float32), self.label_smoothing)

        gradients = tape.gradient(loss, self.transformer.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.transformer.trainable_variables))

        self.loss_tracker.update_state(loss)

        return {'loss': self.loss_tracker.result()}

    @tf.function
    def test_step(self, inputs):
        encoder_inputs, decoder_inputs, targets = inputs
        encoder_mask = self.compute_padding_mask(encoder_inputs)
        decoder_mask = self.compute_padding_lookahead_mask(decoder_inputs)
        
        logits, _, _ = self.transformer(encoder_inputs, decoder_inputs, encoder_mask, decoder_mask, training=False)
        loss = self.loss(targets, logits, mask=tf.cast(tf.math.not_equal(targets, 0), tf.float32), label_smoothing=0.0)

        self.val_loss_tracker.update_state(loss)

        return {'loss': self.val_loss_tracker.result()}

In [27]:
tf.config.run_functions_eagerly(False)

In [28]:
tformer = Transformer(parameters['ENCODER_LAYERS'], parameters['DECODER_LAYERS'], parameters['EMBEDDING_DIMENSION'], parameters['ENCODER_SEQUENCE_LENGTH'], parameters['DECODER_SEQUENCE_LENGTH'], parameters['VOC_SIZE'], parameters['VOC_SIZE'], parameters['ENCODER_ATTENTION_HEADS'], parameters['DECODER_ATTENTION_HEADS'], parameters['ENCODER_FFN_DIM'], parameters['DECODER_FFN_DIM'])
model = TransformerTrainer(tformer, parameters['LABEL_SMOOTHING'])
model.build(((None, parameters['ENCODER_SEQUENCE_LENGTH']), (None, parameters['DECODER_SEQUENCE_LENGTH']), (None, parameters['DECODER_SEQUENCE_LENGTH'])))
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=parameters['LEARNING_RATE'], weight_decay=parameters['L2_REG']), loss=cross_entropy_loss, run_eagerly=True)

In [29]:
model.summary()

In [30]:
class GenerationCallback(tf.keras.callbacks.Callback):
    def __init__(self, df:pd.DataFrame, encoder_seq_len:int, decoder_seq_len: int, tokenizer, sos_id:int, eos_id:int, pad_id:int=0, **kwargs):
        self.df = df
        self.encoder_seq_len = encoder_seq_len
        self.decoder_seq_len = decoder_seq_len
        self.tokenizer = tokenizer
        self.pad_id = pad_id
        self.sos_id = sos_id
        self.eos_id = eos_id

    def generate(self):
        generated = []
        for news, title in self.df.sample(n=2).values.tolist():
            news_encoded = self.tokenizer.encode(news)
        
            if len(news_encoded) >= self.encoder_seq_len:
                news_encoded = news_encoded[:self.encoder_seq_len]
            else:
                news_encoded = news_encoded + [self.pad_id] * (self.encoder_seq_len - len(news_encoded))
        
            encoder_mask = tf.cast(tf.math.not_equal([news_encoded], 0), tf.float32)
            encoder_mask = encoder_mask[:, tf.newaxis, tf.newaxis, :]
        
            enc_outputs, enc_attn_wts = self.model.transformer.encoder(tf.convert_to_tensor([news_encoded]), mask=encoder_mask, training=False)
            
            decoder_input = tf.fill([1,1], self.sos_id)
        
            for t in range(self.decoder_seq_len):
                decoder_mask = tf.cast(tf.math.equal(decoder_input, 0), tf.float32)
                decoder_mask = decoder_mask[:, tf.newaxis, tf.newaxis, :]
                decoder_mask = tf.cast(tf.maximum(decoder_mask, 1 - tf.linalg.band_part(tf.ones((decoder_input.shape[-1], decoder_input.shape[-1])), -1, 0)) == 0, tf.float32)
        
                dec_outputs, dec_attn_wts = self.model.transformer.decoder(decoder_input, enc_outputs, encoder_mask=encoder_mask, decoder_mask=decoder_mask, training=False)
                final_dist = self.model.transformer.projection(dec_outputs[:,-1])
                curr_output = tf.expand_dims(tf.argmax(final_dist, -1, output_type=tf.int32), 1)
                if curr_output[0] == self.eos_id:
                    break
                decoder_input = tf.concat([decoder_input, curr_output], -1)
            generated.append([title, self.tokenizer.decode(tf.squeeze(decoder_input, 0).numpy().tolist())])
        return generated
    # def on_epoch_begin(self, epoch, logs=None):
        ## 1 for first epoch(0)
        # self.model.teacher_forcing_ratio = 0.9**epoch
        
    def on_epoch_end(self, epoch, logs=None):
        generated_titles = self.generate()
        print("\n")
        for ref, pred in generated_titles:
            print(f"REF: {ref}, PRED: {pred}")

In [31]:
cpk = keras.callbacks.ModelCheckpoint(
    filepath='/kaggle/working/best.weights.h5',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
)
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode="min", patience=parameters['EARLY_STOPPING'])
gc = GenerationCallback(df_eval.sample(n=100), parameters['ENCODER_SEQUENCE_LENGTH'], parameters['DECODER_SEQUENCE_LENGTH'], sp, parameters['SOS_TOKEN_ID'], parameters['EOS_TOKEN_ID'], parameters['PAD_TOKEN_ID'])

In [32]:
history = model.fit(train, epochs=parameters["EPOCHS"], batch_size=parameters["BATCH_SIZE"], validation_data=test, callbacks=[cpk, es, gc])

Epoch 1/20
[1m4075/4075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391ms/step - loss: 8.4677

REF: खेलकुदको शहर खेलकुदमै टुहुरो, PRED: राष्ट्रिय प्रदेश प्रदेश १ मा
REF: आर्टिफिसियल इन्टेलिजेन्स अन्तर्राष्ट्रिय कार्यशाला शुरु, PRED: नेपाल बैंकको लगानी
[1m4075/4075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1705s[0m 408ms/step - loss: 8.4676 - val_loss: 6.6108
Epoch 2/20
[1m4075/4075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 385ms/step - loss: 6.9683

REF: पुटिनले अचानक युक्रेन विरुद्ध अर्को निर्णय लिए, PRED: अमेरिकी अमेरिकी हतियार प्रयोग गर्न प्रतिबन्ध
REF: बलिउडकै महंगो विवाह रणवीर र दीपिकाले यस्तो ठाउँ रोजे, PRED: सलमान खान को पहिलो पटक
[1m4075/4075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1634s[0m 401ms/step - loss: 6.9682 - val_loss: 5.7458
Epoch 3/20
[1m4075/4075[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 385ms/step - loss: 6.2738

REF: कांग्रेस भित्र यस्ता छन् गुट र उपगुटहरू, PRED: कांग्रेस केन्द्रीय कमिटी बैठक बस्दै
REF: प्रभुराम

In [33]:
model.save_weights('/kaggle/working/model.weights.h5')
with open('/train_history', 'wb') as f:
    pickle.dump(history.history, f)

In [34]:
with open('/train_history', "rb") as f:
    history = pickle.load(f)
history

{'loss': [7.934503078460693,
  6.765522003173828,
  6.139360427856445,
  5.673072338104248,
  5.30250883102417,
  4.989997863769531,
  4.715693950653076,
  4.471354961395264,
  4.250297546386719,
  4.050029277801514,
  3.8695437908172607,
  3.702496290206909,
  3.5523335933685303,
  3.4147582054138184],
 'val_loss': [6.610799789428711,
  5.745779037475586,
  5.2190260887146,
  4.859400272369385,
  4.612404823303223,
  4.438576698303223,
  4.330516815185547,
  4.270757675170898,
  4.221330165863037,
  4.211203575134277,
  4.214169025421143,
  4.232534885406494,
  4.275265693664551,
  4.3260369300842285]}

In [35]:
tformer_trained = Transformer(parameters['ENCODER_LAYERS'], parameters['DECODER_LAYERS'], parameters['EMBEDDING_DIMENSION'], parameters['ENCODER_SEQUENCE_LENGTH'], parameters['DECODER_SEQUENCE_LENGTH'], parameters['VOC_SIZE'], parameters['VOC_SIZE'], parameters['ENCODER_ATTENTION_HEADS'], parameters['DECODER_ATTENTION_HEADS'], parameters['ENCODER_FFN_DIM'], parameters['DECODER_FFN_DIM'])
model_trained = TransformerTrainer(tformer_trained, parameters['LABEL_SMOOTHING'])
model_trained.build(((None, parameters['ENCODER_SEQUENCE_LENGTH']), (None, parameters['DECODER_SEQUENCE_LENGTH']), (None, parameters['DECODER_SEQUENCE_LENGTH'])))
model_trained.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=parameters['LEARNING_RATE'], weight_decay=parameters['L2_REG']), loss=cross_entropy_loss, run_eagerly=False)

In [36]:
model_trained.load_weights('/kaggle/working/model.weights.h5')

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
def generate(model, news, encoder_seq_len, decoder_seq_len, tokenizer, sos_id, eos_id, pad_id):
    news_encoded = tokenizer.encode(news)
    
    if len(news_encoded) >= encoder_seq_len:
        news_encoded = news_encoded[:encoder_seq_len]
    else:
        news_encoded = news_encoded + [pad_id] * (encoder_seq_len - len(news_encoded))

    output_seq = []

    encoder_mask = tf.cast(tf.math.not_equal([news_encoded], 0), tf.float32)
    encoder_mask = encoder_mask[:, tf.newaxis, tf.newaxis, :]

    enc_outputs, enc_attn_wts = model.transformer.encoder(tf.convert_to_tensor([news_encoded]), mask=encoder_mask, training=False)
    
    decoder_input = tf.fill([1,1], sos_id)

    for t in range(decoder_seq_len):
        decoder_mask = tf.cast(tf.math.equal(decoder_input, 0), tf.float32)
        decoder_mask = decoder_mask[:, tf.newaxis, tf.newaxis, :]
        decoder_mask = tf.cast(tf.maximum(decoder_mask, 1 - tf.linalg.band_part(tf.ones((decoder_input.shape[-1], decoder_input.shape[-1])), -1, 0)) == 0, tf.float32)

        dec_outputs, dec_attn_wts = model.transformer.decoder(decoder_input, enc_outputs, encoder_mask=encoder_mask, decoder_mask=decoder_mask, training=False)
        final_dist = model.transformer.projection(dec_outputs[:,-1])
        curr_output = tf.expand_dims(tf.argmax(final_dist, -1, output_type=tf.int32), 1)
        if curr_output[0] == eos_id:
            break
        decoder_input = tf.concat([decoder_input, curr_output], -1)
    return tokenizer.decode(tf.squeeze(decoder_input, 0).numpy().tolist())

In [38]:
generate(model_trained, df_eval['news'].iloc[1], 256, 12, sp, sp.bos_id(), sp.eos_id(), sp.pad_id())

''