## BERT (Bidirectional Encoder Representations)

## Implementation

### Transformer Encoder

In [5]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

In [14]:
# 1. Positional Encoding

class PositionalEncoding(layers.Layer):
    def __init__(self, max_len, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.positional_encoding(max_len, d_model)
    
    def get_config(self):
        config = super(PositionalEncoding, self).get_config()
        return config
    
    def positional_encoding(self, position, d_model):
        angles = np.arange(position)[:, np.newaxis] / np.power(10000, (2*(np.arange(d_model)[np.newaxis,:]//2))/ np.float32(d_model)) 
        pos_encoding[:, 0::2]=np.sin(angles[:,0::2])
        pos_encoding[:, 0::2] = np.cos(angles[:,1::2])
        pos_encoding = pos_encoding[np.newxaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)
    
    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]
    
    
    class MultiHeadAttention(layers.Layer):
        def __init__(self, d_model, num_heads):
            super(MultiHeadAttention, self).__init__()
            assert d_model % num_heads == 0
            self.num_heads = num_heads
            self.d_model = d_model
            self.depth = d_model // num_heads
            
            self.wq = layers.Dense(d_model)
            self.wk = layers.Dense(d_model)
            self.wv = layers.Dense(d_model)
            
            self.dense - layers.Dense(d_model)
            
            
        def get_config(self):
            config = super(MultiHeadAttention, self).get_config()
            return config
        
        def split_heads(self, x, batch_size):
            x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
            return tf.transpose(x, perm=[0,2,1,3])
        def call(self, v, k, q, mask=None):
            batch_size = tf.shape(q)[0]
            
            q = self.wq(q)
            k = self.wk(k)
            v = self.wv(v)
            
            q = self.split_heads(q, batch_size)
            k = self.split_heads(k, batch_size)
            v = self.split_heads(v, batch_size)
            
            
            matmul_qk = tf.matmul(q,k,transpose_b=True)
            dk = tf.cast(tf.shape(k)[-1], tf.float32)
            scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

            if mask is not None:
                mask = tf.expand_dim(mask, axis=1)
                scaled_attention_logits += (mask * -1e9)

            attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
            output = tf.matmul(attention_weights, v)


            output = tf.transpose(output, perm=[0,2,1,3])
            concat_attention = tf.reshape(output, (batch_size, -1, self.d_model))

            return self.dense(concat_attention)
    
    def feed_forward_network(d_model, d_ff):
        return tf.keras.Sequential([
            layers.Dense(d_ff, activation='relu'),
            layers.Dense(d_model)
        ])
        
    class TransformerEncoderLayer(layers.Layer):
        def __init__(self, d_model, num_heads, d__ff, dropout_rate=0.1):
            super(TransformerEncoderLayer, self).__init__()
            
            self.mha = MultiHeadAttention(d_model, num_heads)
            self.ffn = feed_forward_network(d_model, d_ff)
            
            self.layernorma1 = layers.LayerNormalization(epsilon=1e-6)
            self.layernorma2 = layers.LayerNormalization(epsilon=1e-6)
            
            self.dropout1 = layers.Dropout(dropout_rate)
            self.dropout2 = lauers.Dropout(dropout_rate)
            
        def call(self, x, mask=None, training=False):
            attn_output = self.mha(x, x, x, mask)
            att_output = self.dropout1(attn_output, training=raining)
            out1 = self.layernorm1(x + attn_output)
            
            ffn_output = self.ffn(out1)
            ffn_output = self.dropout2(ffn_output, training=training)
            return self.layernorma2(out1 + ffn_output)     

### Transformer Decoder

In [8]:
class TransformerDecoderLayer(layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        super(TransformerDecoderLayer, self).__init__()
        
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        
        self.ffn = feed_forward_network(d_model, d_ff)
        
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        
        
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
        self.dropout3 = layers.Dropout(dropout_rate)
        
        
    def call(self, x, enc_output, look_ahead_mask=None, padding_mask = None, training = False):
        attn1, attn_weights_block1 = self.mha1(x,x,x, look_ahead_mask)
        attn2 = self.dropout1(attn1, training=training)
        out = self.layernorm2(attn2 + out1)
        
        
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layer
        
        return out3, attn_weights_block1, attn_weights_block2

### MLM and NSP

In [15]:
class BERTWithMLMNSP(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, d_ff, vocab_size, max_len, dropout_rate=0.1):
        super(BERTWithMLMNSP, self).__init__()
        
        self.token_embeddings = layers.Embedding(vocab_size, d_model)
        self.position_encoding = PositionalEncoding(max_len, d_model)
        self.segment_embeddding = layers.Embedding(2, d_model)
        
        self.encoder_layers = [TransformerEncoderLayer(d_model, num_heads, d_ff, dropout_rate) for _ in range(num_layers)]
        self.dropout = layers.Dropout(dropout_rate)
        
        self.mlm_layer = layers.Dense(vocab_size)
        self.nsp_layer = layers.Dense(2)
        
    def call(self, input_ids, segment_ids, mask=None, training=False):
        seq_len = tf.shape(input_ids)[1]
        x = self.token_embeddings(input_ids) + self.segment_embeddings(segment_ids)
        x = self.position_encoding(x)
        x = self.dropout(x, taining=training)
        
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, mask=mask, training=training)
            
        mlm_logits = self.mlm_layer(x)
        
        cls_representation = x[:, 0, :]
        nsp_logits = self.nsp_layer(cls_reprentation)
        return mlm_logits, nsp_logits

### Model Training

In [18]:
vocab_size = 30522
max_len = 512
batch_size = 2

input_ids = tf.random.uniform((batch_size, max_len), minval=0, maxval=vocab_size, dtype=tf.int32)
segment_ids = tf.random.uniform((batch_size, max_len), minval=0, maxval=2, dtype=tf.int32)

bert_mlm_nsp = BERTWithMLMNSP(num_layers=12, d_model=768, num_heads=12, d_ff=3072, vocab_size=vocab_size, max_len=max_len)


mlm_logits, nsp_logits = bert_mlm_nsp(input_ids=input_ids, segment_ids=segment_ids, training=True)


print("MLM logits shape:", mlm_logits.shape)
print("NSP logits shape:", nsp_logits.shape)

UnboundLocalError: cannot access local variable 'pos_encoding' where it is not associated with a value