In [None]:
!pip install transformers

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
!pip install sentencepiece

In [None]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face checkpoint id used for both the tokenizer and the model below.
INDIC_BERT_CHECKPOINT = 'ai4bharat/indic-bert'

# `tokenizer` converts sentences to ids in the data pipeline and during decoding.
tokenizer = AutoTokenizer.from_pretrained(INDIC_BERT_CHECKPOINT)
# NOTE(review): `model` is not referenced anywhere else in the visible notebook —
# confirm whether loading it is still needed.
model = AutoModel.from_pretrained(INDIC_BERT_CHECKPOINT)

In [None]:
!pip install gdown

In [None]:
!gdown https://drive.google.com/uc?id=1-0wClU9sEjQs1O2wLw0KnN9haAVrZYOd

In [None]:
!gdown https://drive.google.com/uc?id=1-3K-X4Up3FlNiQHDMn0GzhDAGXQDEtUU

In [None]:
!gdown https://drive.google.com/uc?id=1PKpgGgEdowuzFNL41Jf6qd48ZFe4e-yI

In [None]:
!gdown https://drive.google.com/uc?id=1owksQpJA1-BE1xLD1Ewq4Ui-pbVm8WVI

In [None]:
!gdown https://drive.google.com/uc?id=1w-icHfAmAQBVM8YzzNOFJUU_ZZyJSRx5

In [None]:
!unzip ./2epoch.zip

In [None]:
# Read the Gujarati side of the training corpus, one sentence per line.
# Every entry keeps its trailing newline; it is stripped when the sentence
# pairs are built.
# The encoding is explicit so the Gujarati text is decoded the same way on every
# platform (assumes the corpus is UTF-8 — TODO confirm), and the `with` block
# closes the file handle, which the original loop never did.
with open('./train.gu', encoding='utf-8') as f:
    gu_data = f.readlines()

In [None]:
# Build (gujarati, english) training pairs by matching line i of train.gu with
# line i of train.en.
# * rstrip('\n') removes only a trailing newline, so a final line without one no
#   longer loses its last character (the old `[:-1]` slice removed it blindly).
# * Indexing gu_data[i] still raises IndexError if train.en has more lines than
#   train.gu, exactly as before.
# * The encoding is explicit and the file is closed by the `with` block
#   (assumes the corpus is UTF-8 — TODO confirm).
with open('./train.en', encoding='utf-8') as f:
    train_pairs = [
        (gu_data[i].rstrip('\n'), en_line.rstrip('\n'))
        for i, en_line in enumerate(f)
    ]

In [None]:
# Read the Gujarati side of the test corpus, one sentence per line.
# Every entry keeps its trailing newline; it is stripped when the sentence
# pairs are built.
# The encoding is explicit so the Gujarati text is decoded the same way on every
# platform (assumes the corpus is UTF-8 — TODO confirm), and the `with` block
# closes the file handle, which the original loop never did.
with open('./test.gu', encoding='utf-8') as f:
    gu_test = f.readlines()

In [None]:
# Build (gujarati, english) test pairs by matching line i of test.gu with
# line i of test.en.
# * rstrip('\n') removes only a trailing newline, so a final line without one no
#   longer loses its last character (the old `[:-1]` slice removed it blindly).
# * Indexing gu_test[i] still raises IndexError if test.en has more lines than
#   test.gu, exactly as before.
# * The encoding is explicit and the file is closed by the `with` block
#   (assumes the corpus is UTF-8 — TODO confirm).
with open('./test.en', encoding='utf-8') as f:
    test_pairs = [
        (gu_test[i].rstrip('\n'), en_line.rstrip('\n'))
        for i, en_line in enumerate(f)
    ]

In [None]:
# Hyperparameters shared by the data pipeline and the model definition.
vocab_size = 200_000   # rows in the token-embedding tables / width of the output softmax
sequence_length = 20   # max_length handed to the tokenizer (ids per sentence)
batch_size = 64        # examples per batch in the tf.data pipelines

In [None]:
def make_dataset(pairs):
    """Build an unbatched ``tf.data.Dataset`` of tokenised translation examples.

    Args:
        pairs: re-iterable collection of ``(gu, eng)`` sentence strings. The
            English sentence feeds the encoder and the Gujarati sentence feeds
            the decoder.

    Returns:
        A dataset whose elements are ``(inputs, targets)`` where ``inputs`` is a
        dict with keys ``encoder_inputs``, ``encoder_mask``, ``decoder_inputs``
        and ``decoder_mask`` (int32 id / attention-mask vectors) and ``targets``
        is the decoder id sequence shifted left by one position.
    """

    def encode(text, max_length):
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`;
        # together with `truncation=True` every sentence comes back with exactly
        # `max_length` ids.
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            padding="max_length",
            truncation=True,
        )

    def gen():
        for gu, eng in pairs:
            eng_enc = encode(eng, sequence_length)
            # One extra position so that both the decoder input ([:-1]) and the
            # shifted target ([1:]) end up with `sequence_length` ids.
            gu_enc = encode(gu, sequence_length + 1)

            inputs = {
                "encoder_inputs": eng_enc["input_ids"],
                "encoder_mask": eng_enc["attention_mask"],
                "decoder_inputs": gu_enc["input_ids"][:-1],
                "decoder_mask": gu_enc["attention_mask"][:-1],
            }
            yield inputs, gu_enc["input_ids"][1:]

    # `output_signature` supersedes the deprecated positional
    # output_types / output_shapes arguments of `from_generator`.
    ids_spec = tf.TensorSpec(shape=(None,), dtype=tf.int32)
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "encoder_inputs": ids_spec,
                "encoder_mask": ids_spec,
                "decoder_inputs": ids_spec,
                "decoder_mask": ids_spec,
            },
            ids_spec,
        ),
    )


train_ds = make_dataset(train_pairs)

In [None]:
# Build both pipelines straight from the sentence pairs. The training pipeline
# used to be derived from `train_ds` itself, so running this cell a second time
# shuffled and batched an already-batched dataset; rebuilding from `train_pairs`
# makes the cell idempotent.
train_ds = make_dataset(train_pairs).shuffle(2048).batch(batch_size)
test_ds = make_dataset(test_pairs).shuffle(2048).batch(batch_size)

In [None]:
class TransformerEncoder(layers.Layer):
    """One Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-block is wrapped in a residual connection and layer normalisation.

    Args:
        embed_dim: size of the last axis of the inputs/outputs; the per-head key
            size is ``embed_dim // num_heads``.
        dense_dim: width of the hidden feed-forward layer.
        num_heads: number of attention heads.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads
        )
        # The hidden layer needs a non-linearity: `activation="relu"` had been
        # commented out together with the kernel initializer, which collapsed the
        # feed-forward block into a purely linear map. This now matches
        # TransformerDecoder.
        # NOTE(review): the layer has no extra weights, so old checkpoints still
        # load, but a model trained without the ReLU should be retrained.
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim)]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply the block.

        Args:
            inputs: tensor attended over by itself (query = key = value).
            mask: optional padding mask; two singleton axes are inserted before
                its last axis so it can broadcast inside the attention layer
                (assumes ``mask`` is (batch, seq) — TODO confirm).

        Returns:
            Tensor produced by the second layer normalisation.
        """
        # Default to "no mask" so the layer also works when none is supplied
        # (previously `padding_mask` was unbound in that case).
        padding_mask = None
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        sequence_length: number of rows in the position-embedding table.
        vocab_size: number of rows in the token-embedding table.
        embed_dim: width of both embedding tables.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments on the instance.
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        # Token table first, position table second (same creation order as before).
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of each position index."""
        # Indices 0 .. L-1, where L is the size of the last axis of `inputs`.
        position_ids = tf.range(tf.shape(inputs)[-1])
        token_vectors = self.token_embeddings(inputs)
        position_vectors = self.position_embeddings(position_ids)
        return token_vectors + position_vectors

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is True wherever ``inputs`` is non-zero."""
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    """One Transformer decoder block.

    Causal self-attention, then attention over the encoder outputs, then a
    feed-forward net; each sub-block has a residual connection followed by layer
    normalisation.

    Args:
        embed_dim: size of the last axis of the inputs/outputs; the per-head key
            size is ``embed_dim // num_heads``.
        latent_dim: width of the hidden (ReLU) feed-forward layer.
        num_heads: number of attention heads in both attention layers.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"),
             layers.Dense(embed_dim)]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        """Apply the block.

        Args:
            inputs: decoder-side sequence tensor.
            encoder_outputs: tensor used as key and value of the second
                attention layer.
            mask: optional decoder padding mask (assumes (batch, seq) —
                TODO confirm).

        Returns:
            Tensor produced by the third layer normalisation.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        # Default to "no mask" for the second attention layer so the block also
        # works when no padding mask is supplied (previously `padding_mask` was
        # unbound in that case).
        padding_mask = None
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        # Self-attention is restricted by the causal mask only.
        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # NOTE(review): the mask used here is built from the *decoder* padding
        # mask combined with the causal mask, yet it is applied to the *encoder*
        # positions. That only lines up when both sequences have the same length,
        # and it presumably is not the intended encoder padding mask — confirm
        # before changing, since existing checkpoints were trained this way.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        """Return an int32 (batch, seq, seq) mask with 1 where ``i >= j``.

        ``batch`` and ``seq`` are the first two dimensions of ``inputs``.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        # Lower-triangular (including diagonal) matrix of ones.
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)


In [None]:
# Model hyperparameters.
embed_dim = 512    # width of the embeddings and of every encoder/decoder block
latent_dim = 2048  # hidden width of the feed-forward sub-layers
num_heads = 8      # attention heads per attention layer

# ---- Encoder: embedding followed by three stacked TransformerEncoder blocks ----
# NOTE(review): the Inputs are declared int64 while make_dataset yields int32
# tensors — confirm the implicit cast is intended.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
encoder_mask = keras.Input(shape=(None,), dtype="int64", name="encoder_mask")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)

# The tokenizer's attention mask is passed explicitly as `mask` to every block.
x = TransformerEncoder(embed_dim, latent_dim, num_heads)(x,mask=encoder_mask)
x = TransformerEncoder(embed_dim, latent_dim, num_heads)(x,mask=encoder_mask)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x,mask=encoder_mask)    # third and last encoder block

encoder = keras.Model([encoder_inputs, encoder_mask], encoder_outputs)

# ---- Decoder: its own embedding followed by three stacked TransformerDecoder blocks ----
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
decoder_mask = keras.Input(shape=(None,), dtype="int64", name="decoder_mask")
# Placeholder for the encoder output so the decoder can be a stand-alone model.
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
# A second PositionalEmbedding instance: embeddings are not shared with the encoder.
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)

x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs, mask=decoder_mask)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs, mask=decoder_mask)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs, mask=decoder_mask)   # third and last decoder block

# Dropout, then a softmax over the vocabulary at every position.
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size,
                              activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs, decoder_mask], decoder_outputs)

# ---- End-to-end model: feed the encoder output into the decoder model ----
# `decoder_outputs` is rebound here to the output of the chained call.
decoder_outputs = decoder([decoder_inputs, encoder_outputs, decoder_mask])
transformer = keras.Model(
    [encoder_inputs, encoder_mask, decoder_inputs, decoder_mask], decoder_outputs, name="transformer"
)

In [None]:
# Number of passes over the training set used by the fit cell.
epochs = 1

# Print the layer/parameter overview before compiling.
transformer.summary()

# Adam with gradient-norm clipping; the loss takes integer class ids as targets.
# NOTE(review): the targets include padding ids and nothing here masks them out
# of the loss or the accuracy metric — confirm that is intended.
transformer.compile(
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=tf.keras.optimizers.Adam(
        clipnorm=1,
        epsilon=1e-9,
        learning_rate=1e-4,
    ),
)

In [None]:
 # Train the compiled model on the batched training dataset for `epochs` passes.
 transformer.fit(train_ds, epochs=epochs)

In [None]:
import random
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from tqdm import tqdm
def decode_sequence(input_sentence):
    """Greedily translate one sentence with the global ``transformer`` model.

    The encoder input is tokenised once. The decoder input starts as the
    tokenizer's [CLS] id and grows by the arg-max token of each step until the
    [SEP] id is produced or ``sequence_length`` steps have run.

    Args:
        input_sentence: source-language sentence (string) fed to the encoder.

    Returns:
        ``tokenizer.decode`` of the generated ids, including the leading [CLS]
        id and, when it was generated, the trailing [SEP] id.
    """
    # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
    source = tokenizer.encode_plus(
        input_sentence,
        add_special_tokens=True,
        max_length=sequence_length,
        return_token_type_ids=True,
        return_attention_mask=True,
        padding="max_length",
        truncation=True,
        return_tensors="tf",
    )

    pad_id = tokenizer.pad_token_id
    sep_id = tokenizer.sep_token_id
    tokens = [tokenizer.cls_token_id]

    for step in range(sequence_length):
        # Feed the generated ids back directly. Re-tokenising the decoded text
        # (as was done before) is not guaranteed to reproduce the same ids or
        # the same number of ids, so `predictions[0, step]` could point at the
        # wrong position.
        n_pad = sequence_length - len(tokens)
        target_ids = tf.constant([tokens + [pad_id] * n_pad], dtype=tf.int32)
        target_mask = tf.constant([[1] * len(tokens) + [0] * n_pad], dtype=tf.int32)

        predictions = transformer(
            [source["input_ids"], source["attention_mask"], target_ids, target_mask],
            training=False,  # make inference mode (no dropout) explicit
        )

        # The distribution at position `step` predicts the token that follows
        # the `step + 1` ids generated so far.
        next_id = int(np.argmax(predictions[0, step, :]))
        tokens.append(next_id)

        if next_id == sep_id:
            break
    return tokenizer.decode(tokens)


# Sentence-level BLEU-1..4 of the greedy translations against the references,
# averaged over the test set.
bleu1 = []
bleu2 = []
bleu3 = []
bleu4 = []
preds = []

# One smoothing function for the whole run instead of a new object per call.
bleu_smoothing = SmoothingFunction().method1
# Uniform n-gram weights; BLEU-3 uses exact thirds so the weights sum to 1
# (they were 0.33 each before).
bleu_runs = (
    (bleu1, (1,)),
    (bleu2, (0.5, 0.5)),
    (bleu3, (1 / 3, 1 / 3, 1 / 3)),
    (bleu4, (0.25, 0.25, 0.25, 0.25)),
)

for target, input_sentence in tqdm(test_pairs):
    translated = decode_sequence(input_sentence).strip()
    # Remove the sentence markers only when they are really there. The old fixed
    # `[5:-5]` slice cut five real characters off any translation that reached
    # the length limit without producing "[SEP]".
    if translated.startswith("[CLS]"):
        translated = translated[len("[CLS]"):]
    if translated.endswith("[SEP]"):
        translated = translated[:-len("[SEP]")]

    ac = [target.split()]      # list of reference token lists
    pr = translated.split()    # hypothesis tokens
    preds.append(pr)
    for scores, weights in bleu_runs:
        scores.append(
            sentence_bleu(ac, pr, weights=weights, smoothing_function=bleu_smoothing)
        )

# Mean BLEU-1, BLEU-2, BLEU-3 and BLEU-4, one per line (nan for an empty test set).
for scores, _ in bleu_runs:
    print(np.mean(scores) if scores else float("nan"))

In [None]:
# Save the model under ./2epoch; the final cell zips that path into 2epoch.zip.
transformer.save('./2epoch')

In [None]:
# Load the weights stored under ./2epoch into the in-memory model.
# NOTE(review): on a top-to-bottom run this executes right after the save cell
# and after training/evaluation; presumably it was meant to be run after the
# unzip cell to restore a checkpoint before evaluating — confirm and reorder.
transformer.load_weights('./2epoch')

In [None]:
!zip -r ./2epoch.zip ./2epoch