In [3]:
import tensorflow as tf
import numpy as np

In [4]:
# 4 dimensions for each word
import math
def positionalEncodings(sequence, d_model=4):
    """Build sinusoidal positional encodings for ``sequence`` positions.

    Uses the "Attention Is All You Need" formulation, in which each
    even/odd pair of dimensions shares one frequency::

        PE[pos][2k]     = sin(pos / 10000 ** (2k / d_model))
        PE[pos][2k + 1] = cos(pos / 10000 ** (2k / d_model))

    Args:
        sequence: Number of positions (rows) to generate.
        d_model: Number of dimensions per position. Defaults to 4, the
            width previously hard-coded in this function.

    Returns:
        A list of ``sequence`` lists, each containing ``d_model`` floats.
    """
    PE = []
    for pos in range(sequence):
        EncodingsOfEach = []
        for i in range(d_model):
            # ``i`` is already the dimension index, so the even index of its
            # pair (2k in the formula) is ``i - i % 2``; multiplying by 2
            # again would double the exponent.
            angle = pos / math.pow(10000, (i - i % 2) / d_model)
            if i % 2 == 0:
                EncodingsOfEach.append(math.sin(angle))
            else:
                EncodingsOfEach.append(math.cos(angle))
        PE.append(EncodingsOfEach)
    return PE

In [5]:
def selfAttention(embeddings, W_q, W_k, W_v):
    """Single-head scaled dot-product self-attention over one sequence.

    Computes ``softmax(Q K^T / sqrt(d_k)) V`` with ``Q``, ``K`` and ``V``
    obtained by multiplying ``embeddings`` by ``W_q``, ``W_k`` and ``W_v``.
    The whole sequence is processed with matrix products instead of
    per-token Python loops, so an empty sequence no longer raises and the
    function does not depend on iterating over a tensor-valued length.

    Args:
        embeddings: 2-D tensor with one row per token.
        W_q: Query projection, applied as ``embeddings @ W_q``.
        W_k: Key projection, applied as ``embeddings @ W_k``.
        W_v: Value projection, applied as ``embeddings @ W_v``.

    Returns:
        2-D tensor with one attended row per input token; its width is the
        number of columns of ``W_v``.
    """
    Q = tf.matmul(embeddings, W_q)
    K = tf.matmul(embeddings, W_k)
    V = tf.matmul(embeddings, W_v)

    # Scale by sqrt(d_k), where d_k is the width of the key vectors.
    scale = tf.math.sqrt(tf.cast(tf.shape(K)[-1], tf.float32))

    # scores[i, j] is the scaled dot product of query i with key j.
    scores = tf.matmul(Q, K, transpose_b=True) / scale

    # Normalise each query's scores over all keys.
    weights = tf.nn.softmax(scores, axis=-1)

    # Row i is the attention-weighted sum of the value vectors.
    return tf.matmul(weights, V)


In [6]:
def Add_and_normalize(output_of_previous,input_of_previous):
    """Residual connection followed by layer normalisation.

    Adds a sub-layer's output to that sub-layer's input and passes the sum
    through a newly constructed ``LayerNormalization`` (epsilon ``1e-6``).

    Args:
        output_of_previous: Output of the sub-layer.
        input_of_previous: Input that was fed to the sub-layer.

    Returns:
        The layer-normalised sum of the two arguments.
    """
    residual_sum = output_of_previous + input_of_previous
    normalizer = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    return normalizer(residual_sum)

In [7]:
def FeedForwardNN(x, W1, b1, W2, b2):
    """Two-layer feed-forward network with a ReLU in between.

    Computes ``relu(x @ W1 + b1) @ W2 + b2``.

    Args:
        x: Input tensor.
        W1: Weights of the first (hidden) layer.
        b1: Bias of the first layer.
        W2: Weights of the second (output) layer.
        b2: Bias of the second layer.

    Returns:
        Output of the second linear layer.
    """
    pre_activation = tf.matmul(x, W1) + b1
    activated = tf.nn.relu(pre_activation)
    return tf.matmul(activated, W2) + b2


In [8]:
def Encoder(positional_encodings_plus_embeddings,W_q,W_k,W_v,W1,b1,W2,b2):
    """Run one encoder layer.

    Applies self-attention and then the feed-forward network, each followed
    by ``Add_and_normalize`` with the sub-layer's own input as the residual.

    Args:
        positional_encodings_plus_embeddings: Input to the layer.
        W_q, W_k, W_v: Projection matrices passed to ``selfAttention``.
        W1, b1, W2, b2: Parameters passed to ``FeedForwardNN``.

    Returns:
        Output of the second add-and-normalise step.
    """
    layer_input = positional_encodings_plus_embeddings

    # Sub-layer 1: self-attention with residual connection and layer norm.
    attended = Add_and_normalize(
        selfAttention(layer_input, W_q, W_k, W_v),
        layer_input,
    )

    # Sub-layer 2: feed-forward with residual connection and layer norm.
    return Add_and_normalize(
        FeedForwardNN(attended, W1, b1, W2, b2),
        attended,
    )

In [9]:
# Toy input: three tokens, each a hand-written 4-dimensional embedding.
embeddings=tf.constant([[1,0,1,0],[0,1,0,1],[1,1,1,1]],dtype=tf.float32)
# One positional-encoding row per token (len() of the tensor is its row count).
positional_encodings=positionalEncodings(len(embeddings))
# Element-wise sum of embeddings and positional encodings; passed to Encoder / encoderStack below.
embeddings_plus_PE=embeddings+positional_encodings

In [10]:
# Model width, read from the length of one token row.
d_model=embeddings_plus_PE[0].shape[0]
# Identity projections: with these, Q, K and V each equal the layer input.
W_q=tf.constant([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]],dtype=tf.float32)
W_k=tf.constant([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]],dtype=tf.float32)
W_v=tf.constant([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]],dtype=tf.float32)
# Feed-forward parameters: d_model -> 256 hidden units -> d_model.
# Drawn from tf.random.normal with no seed set in this cell, so values differ between runs.
W1 = tf.random.normal(shape=(d_model,256))
b1 = tf.random.normal(shape=(256,))
W2 = tf.random.normal(shape=(256,d_model))
b2 = tf.random.normal(shape=(d_model,))

In [11]:
# Run a single encoder layer on the toy input and print the resulting tensor.
print(Encoder(embeddings_plus_PE,W_q,W_k,W_v,W1,b1,W2,b2))

tf.Tensor(
[[ 0.25423634 -1.6654139   0.9937296   0.4174478 ]
 [ 0.21969035 -1.6718075   0.499704    0.95241326]
 [ 0.18654864 -1.5957975   1.1659479   0.24330103]], shape=(3, 4), dtype=float32)


In [12]:
# Model width: number of columns of the encoder input.
d_model = embeddings_plus_PE.shape[1]

# Parameters for six encoder layers; every layer gets its own random weights.
encoder_params = []
for _ in range(6):
    params = dict(
        W_q=tf.random.normal(shape=(d_model, d_model)),
        W_k=tf.random.normal(shape=(d_model, d_model)),
        W_v=tf.random.normal(shape=(d_model, d_model)),
        W1=tf.random.normal(shape=(d_model, 256)),
        b1=tf.random.normal(shape=(256,)),
        W2=tf.random.normal(shape=(256, d_model)),
        b2=tf.random.normal(shape=(d_model,)),
    )
    encoder_params.append(params)


In [13]:
def encoderStack(x, encoder_params):
    """Feed ``x`` through a sequence of encoder layers.

    Args:
        x: Input to the first encoder layer.
        encoder_params: Iterable of dicts, one per layer, each holding the
            keys ``"W_q"``, ``"W_k"``, ``"W_v"``, ``"W1"``, ``"b1"``,
            ``"W2"`` and ``"b2"``.

    Returns:
        Output of the last layer (``x`` itself if there are no layers).
    """
    # Positional order in which Encoder expects its weights after the input.
    weight_names = ("W_q", "W_k", "W_v", "W1", "b1", "W2", "b2")
    for layer in encoder_params:
        layer_weights = [layer[name] for name in weight_names]
        x = Encoder(x, *layer_weights)
    return x


In [36]:
# Output of the six-layer encoder stack; the decoder cells below attend over it.
encoder_output = encoderStack(embeddings_plus_PE, encoder_params)

In [37]:
# Display the encoder stack's output tensor.
encoder_output

<tf.Tensor: shape=(3, 4), dtype=float32, numpy=
array([[ 1.7191175 , -0.41607088, -0.75771856, -0.5453281 ],
       [ 1.7028332 , -0.3062513 , -0.82348776, -0.5730942 ],
       [ 1.6999527 , -0.28766042, -0.82888055, -0.5834118 ]],
      dtype=float32)>

Decoder:


In [None]:
def masked_Attention(embeddings, W_q, W_k, W_v,mask):
    """Single-head scaled dot-product self-attention with an additive mask.

    Computes ``softmax(Q K^T / sqrt(d_k) + mask) V``. The whole sequence is
    processed with matrix products instead of per-token Python loops, so an
    empty sequence no longer raises and the function does not depend on
    iterating over a tensor-valued length.

    Args:
        embeddings: 2-D tensor with one row per token.
        W_q: Query projection, applied as ``embeddings @ W_q``.
        W_k: Key projection, applied as ``embeddings @ W_k``.
        W_v: Value projection, applied as ``embeddings @ W_v``.
        mask: 2-D additive mask indexed as ``mask[query][key]``; entry
            ``[i][j]`` is added to the score of query ``i`` against key
            ``j`` before the softmax. It may be larger than the sequence,
            in which case only its top-left corner is used.

    Returns:
        2-D tensor with one attended row per input token; its width is the
        number of columns of ``W_v``.
    """
    Q = tf.matmul(embeddings, W_q)
    K = tf.matmul(embeddings, W_k)
    V = tf.matmul(embeddings, W_v)

    # Scale by sqrt(d_k), where d_k is the width of the key vectors.
    scale = tf.math.sqrt(tf.cast(tf.shape(K)[-1], tf.float32))

    # scores[i, j] is the scaled dot product of query i with key j.
    scores = tf.matmul(Q, K, transpose_b=True) / scale

    # The loop-based version only ever read mask[i][j] for positions inside
    # the sequence, so an oversized mask was accepted; slicing keeps that.
    seq_len = tf.shape(embeddings)[0]
    additive_mask = tf.cast(mask, scores.dtype)[:seq_len, :seq_len]
    scores = scores + additive_mask

    # Normalise each query's scores over all keys.
    weights = tf.nn.softmax(scores, axis=-1)

    # Row i is the attention-weighted sum of the value vectors.
    return tf.matmul(weights, V)


In [44]:
def Encoder_decoder_attention(encoder_output,prev_layer_output,W_q,W_k,W_v):
    """Single-head scaled dot-product cross-attention.

    Queries come from ``prev_layer_output``; keys and values come from
    ``encoder_output``. The result is ``softmax(Q K^T / sqrt(d_k)) V``.

    The two inputs may have different numbers of rows. The earlier
    loop-based version used the encoder's row count for both the query and
    key loops, so it only worked when both sequences had the same length.

    Args:
        encoder_output: 2-D tensor with one row per encoder position.
        prev_layer_output: 2-D tensor with one row per decoder position.
        W_q: Query projection, applied as ``prev_layer_output @ W_q``.
        W_k: Key projection, applied as ``encoder_output @ W_k``.
        W_v: Value projection, applied as ``encoder_output @ W_v``.

    Returns:
        2-D tensor with one row per row of ``prev_layer_output``; its width
        is the number of columns of ``W_v``.
    """
    Q = tf.matmul(prev_layer_output, W_q)
    K = tf.matmul(encoder_output, W_k)
    V = tf.matmul(encoder_output, W_v)

    # Scale by sqrt(d_k), where d_k is the width of the key vectors.
    scale = tf.math.sqrt(tf.cast(tf.shape(K)[-1], tf.float32))

    # scores[i, j] is the scaled dot product of decoder query i with encoder key j.
    scores = tf.matmul(Q, K, transpose_b=True) / scale

    # Normalise each decoder query's scores over all encoder positions.
    weights = tf.nn.softmax(scores, axis=-1)

    # Row i is the attention-weighted sum of the encoder value vectors.
    return tf.matmul(weights, V)
    

In [48]:
def decoder(positional_encodings_plus_embeddings,encoder_output,W_q_self,W_k_self,W_v_self,W_q_cross,W_k_cross,W_v_cross,mask,W1,b1,W2,b2):
    """Run one decoder layer.

    Applies masked self-attention, encoder-decoder attention and the
    feed-forward network in that order, each followed by
    ``Add_and_normalize`` with the sub-layer's own input as the residual.

    Args:
        positional_encodings_plus_embeddings: Input to the layer.
        encoder_output: Encoder result passed to ``Encoder_decoder_attention``.
        W_q_self, W_k_self, W_v_self: Projections for ``masked_Attention``.
        W_q_cross, W_k_cross, W_v_cross: Projections for
            ``Encoder_decoder_attention``.
        mask: Mask forwarded to ``masked_Attention``.
        W1, b1, W2, b2: Parameters passed to ``FeedForwardNN``.

    Returns:
        Output of the third add-and-normalise step.
    """
    layer_input = positional_encodings_plus_embeddings

    # Sub-layer 1: masked self-attention over the decoder input.
    self_attended = Add_and_normalize(
        masked_Attention(layer_input, W_q_self, W_k_self, W_v_self, mask),
        layer_input,
    )

    # Sub-layer 2: attention over the encoder output.
    cross_attended = Add_and_normalize(
        Encoder_decoder_attention(encoder_output, self_attended, W_q_cross, W_k_cross, W_v_cross),
        self_attended,
    )

    # Sub-layer 3: feed-forward network.
    return Add_and_normalize(
        FeedForwardNN(cross_attended, W1, b1, W2, b2),
        cross_attended,
    )

In [50]:
def causal_mask(d):
    """Build a ``d`` x ``d`` additive mask that blocks the upper triangle.

    Entries on and below the main diagonal are zero; entries strictly above
    it are ``-1e9``.

    Args:
        d: Side length of the square mask.

    Returns:
        A ``(d, d)`` tensor meant to be added to attention scores.
    """
    # Ones on and below the diagonal, zeros above it.
    lower_triangle = tf.linalg.band_part(tf.ones((d, d)), -1, 0)
    # Flip so that only the strictly-upper entries are non-zero, then scale.
    blocked = 1.0 - lower_triangle
    return blocked * -1e9


In [53]:
# Model width, read from the length of one token row.
d_model=embeddings_plus_PE[0].shape[0]
# The mask is indexed as mask[query position][key position], so its side
# length must be the sequence length (number of rows), not the embedding width.
mask=causal_mask(len(embeddings_plus_PE))
decoder_params = []

# Parameters for six decoder layers; every layer gets its own random weights
# for masked self-attention, encoder-decoder attention and the feed-forward net.
for _ in range(6):
    params = {
        "W_q_self": tf.random.normal(shape=(d_model,d_model)),
        "W_k_self": tf.random.normal(shape=(d_model,d_model)),
        "W_v_self": tf.random.normal(shape=(d_model,d_model)),
        "W_q_cross": tf.random.normal(shape=(d_model,d_model)),
        "W_k_cross": tf.random.normal(shape=(d_model,d_model)),
        "W_v_cross": tf.random.normal(shape=(d_model,d_model)),
        "W1": tf.random.normal(shape=(d_model,256)),
        "b1": tf.random.normal(shape=(256,)),
        "W2": tf.random.normal(shape=(256,d_model)),
        "b2": tf.random.normal(shape=(d_model,))
    }
    decoder_params.append(params)

In [51]:
def decoderStack(x,decoder_params,encoder_output,mask=None):
    """Feed ``x`` through a sequence of decoder layers.

    Args:
        x: Input to the first decoder layer, one row per position.
        decoder_params: Iterable of dicts, one per layer, each holding the
            keys ``"W_q_self"``, ``"W_k_self"``, ``"W_v_self"``,
            ``"W_q_cross"``, ``"W_k_cross"``, ``"W_v_cross"``, ``"W1"``,
            ``"b1"``, ``"W2"`` and ``"b2"``.
        encoder_output: Encoder result every layer attends over.
        mask: Optional additive self-attention mask. When omitted, a causal
            mask sized to the number of rows of ``x`` is built, so the
            function no longer depends on a notebook-level ``mask`` variable
            having been defined beforehand.

    Returns:
        Output of the last layer (``x`` itself if there are no layers).
    """
    if mask is None:
        # One mask row/column per decoder position.
        mask = causal_mask(len(x))
    for layer in decoder_params:
        x = decoder(
            x,
            encoder_output,
            layer["W_q_self"],
            layer["W_k_self"],
            layer["W_v_self"],
            layer["W_q_cross"],
            layer["W_k_cross"],
            layer["W_v_cross"],
            mask,
            layer["W1"],
            layer["b1"],
            layer["W2"],
            layer["b2"],
        )
    return x

In [None]:
# Decoder-side toy input: three tokens, each a hand-written 4-dimensional embedding.
# NOTE(review): this rebinds `embeddings`, `positional_encodings` and
# `embeddings_plus_PE`, the same names used for the encoder input earlier, so
# re-running the encoder cells after this one will pick up the decoder values.
embeddings=tf.constant([[1,0,1,0],[0,1,1,0],[1,1,0,1]],dtype=tf.float32)
# One positional-encoding row per token.
positional_encodings=positionalEncodings(len(embeddings))
# Element-wise sum of embeddings and positional encodings; passed to decoderStack below.
embeddings_plus_PE=embeddings+positional_encodings

In [54]:
# Run the decoder stack on the decoder input, attending over the encoder output, and print the result.
print(decoderStack(embeddings_plus_PE,decoder_params,encoder_output))

tf.Tensor(
[[-1.2151812   1.5449194   0.03785586 -0.367594  ]
 [-1.2151648   1.5449364   0.03781128 -0.3675828 ]
 [-1.2152151   1.5449104   0.03782833 -0.36752343]], shape=(3, 4), dtype=float32)
