In [8]:
import numpy as np

In [9]:
def gelu(x):
    """Gaussian Error Linear Unit, using the tanh approximation.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.

    Args:
        x: A number or NumPy array (anything that supports ``x**3`` and
            arithmetic with floats).

    Returns:
        The activation value(s), with the same shape as ``x``.
    """
    cubic_term = 0.044715 * x**3
    tanh_arg = np.sqrt(2 / np.pi) * (x + cubic_term)
    return 0.5 * x * (1 + np.tanh(tanh_arg))

In [10]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Works on plain 2-D matrices and on inputs with extra leading
    (batch / head) dimensions: the matrix products act on the last two
    axes and any leading axes are broadcast.

    Args:
        query: Array of shape ``(..., seq_len_q, d_k)``.
        key: Array of shape ``(..., seq_len_k, d_k)``.
        value: Array of shape ``(..., seq_len_k, d_v)``.
        mask: Optional array broadcastable against the logits
            ``(..., seq_len_q, seq_len_k)``. ``mask * -1e9`` is added to
            the logits, so non-zero entries mark positions whose weight
            should be driven to (almost) zero.

    Returns:
        A tuple ``(output, attention_weights)`` with shapes
        ``(..., seq_len_q, d_v)`` and ``(..., seq_len_q, seq_len_k)``.
    """
    # Transpose only the last two axes. ``key.T`` would reverse *every*
    # axis and ``np.dot`` does not batch, so both break for inputs with
    # leading batch dimensions. For 2-D inputs the result is unchanged.
    matmul_qk = np.matmul(query, np.swapaxes(key, -1, -2))
    d_k = key.shape[-1]
    scaled_attention_logits = matmul_qk / np.sqrt(d_k)

    if mask is not None:
        # Out-of-place add so a mask with more leading dimensions than
        # the logits can broadcast (an in-place ``+=`` cannot grow the
        # left-hand side).
        scaled_attention_logits = scaled_attention_logits + (mask * -1e9)

    # ``softmax`` is the helper defined elsewhere in this notebook.
    attention_weights = softmax(scaled_attention_logits, axis=-1)
    output = np.matmul(attention_weights, value)

    return output, attention_weights


In [11]:
def softmax(x, axis=-1):
    """Numerically stable softmax along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating, so
    the largest exponent is zero and ``np.exp`` cannot overflow; the
    shift cancels out in the normalised result.

    Args:
        x: Input array.
        axis: Axis along which the values are normalised to sum to 1.

    Returns:
        Array of the same shape as ``x`` whose entries along ``axis``
        sum to 1.
    """
    peak = np.max(x, axis=axis, keepdims=True)
    exps = np.exp(x - peak)
    normaliser = exps.sum(axis=axis, keepdims=True)
    return exps / normaliser

In [12]:
class PositionalEncoding:
    """Precomputed sinusoidal positional-encoding table.

    The table has shape ``(max_seq_len, d_model)``. For position ``p``
    and column ``i`` the angle is ``p / 10000 ** (2 * (i // 2) / d_model)``;
    even columns store its sine and odd columns its cosine.
    """

    def __init__(self, d_model, max_seq_len=512):
        # Build the whole table once; lookups afterwards only slice it.
        self.encoding = self.generate_positional_encoding(d_model, max_seq_len)

    def generate_positional_encoding(self, d_model, max_seq_len):
        """Build the ``(max_seq_len, d_model)`` sine/cosine table.

        Args:
            d_model: Number of columns (encoding width).
            max_seq_len: Number of rows (positions ``0 .. max_seq_len - 1``).

        Returns:
            A float array of shape ``(max_seq_len, d_model)``.
        """
        positions = np.arange(max_seq_len)[:, np.newaxis]
        # Columns 2k and 2k + 1 share the same frequency.
        pair_index = np.arange(d_model) // 2
        inv_freq = 1 / np.power(10000, 2 * pair_index / d_model)

        # Angle for every (position, column) pair, then pick sin / cos by parity.
        angles = positions * inv_freq
        table = np.zeros((max_seq_len, d_model))
        table[:, 0::2] = np.sin(angles[:, 0::2])
        table[:, 1::2] = np.cos(angles[:, 1::2])

        return table

    def get_positional_encoding(self, seq_len):
        """Return the first ``seq_len`` rows of the table.

        The result is a slice of the stored table, not a copy.
        """
        return self.encoding[:seq_len]


In [13]:
class TransformerEncoder:
    """Encoder made of ``num_layers`` callables wrapping one shared layer step.

    ``self.layers`` is a list of ``num_layers`` callables, each taking
    ``(x, mask)`` and returning the transformed ``x``.

    NOTE(review): every entry in ``self.layers`` delegates to the same
    ``encoder_layer`` method, so all layers share the single
    ``multi_head_attention`` and ``feed_forward`` objects, and the
    positional encoding is added again on every layer call. Confirm this
    is intended.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, max_seq_len):
        """Store the hyper-parameters and build the sub-modules.

        Args:
            num_layers: Number of layer callables to create.
            d_model: Model width, passed to every sub-module.
            num_heads: Passed to ``MultiHeadAttention``.
            dff: Passed to ``FeedForward``.
            max_seq_len: Number of positions in the positional-encoding table.
        """
        self.num_layers = num_layers
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.max_seq_len = max_seq_len

        self.pos_encoding = PositionalEncoding(d_model, max_seq_len)
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, dff)
        self.layers = self.create_encoder_layers()

    def create_encoder_layers(self):
        """Return a list of ``self.num_layers`` layer callables."""
        return [self.create_encoder_layer() for _ in range(self.num_layers)]

    def create_encoder_layer(self):
        """Return a callable ``(x, mask) -> x`` that runs ``encoder_layer``."""
        return lambda x, mask: self.encoder_layer(x, mask)

    def encoder_layer(self, x, mask):
        """Add positional encoding, then apply self-attention and feed-forward.

        Args:
            x: Input array; ``x.shape[1]`` is used as the sequence length,
                so this presumably expects ``(batch, seq_len, d_model)``
                (TODO confirm against callers).
            mask: Forwarded unchanged to ``multi_head_attention``.

        Returns:
            The output of ``feed_forward``. The caller's ``x`` is left
            unmodified.
        """
        # Out-of-place add: ``x += ...`` would overwrite the caller's array
        # and raises a casting error when ``x`` has an integer dtype.
        x = x + self.pos_encoding.get_positional_encoding(x.shape[1])
        # Self-attention: query, key and value are all ``x``; the second
        # return value is discarded.
        x, _ = self.multi_head_attention(x, x, x, mask)
        x = self.feed_forward(x)
        return x