In [1]:
import numpy as np
import math

In [2]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import tensorflow as tf




In [3]:
# Collect the path (as a string) of every .txt file in the prompts dataset folder.
prompt_dir = Path('../prompts_dataset/')
paths = list(map(str, prompt_dir.glob('*.txt')))

# Tokenizer

In [4]:
# Untrained byte-level BPE tokenizer; its vocabulary is learned by the train() call in the next cell.
tokenizer = ByteLevelBPETokenizer()

In [5]:
# Train the BPE vocabulary on the prompt files and register the control tokens
# (sequence, padding, unknown, mask and the three <lvlN> tag pairs) as special tokens.
tokenizer.train(
    files=paths,
    vocab_size=30_522,
    min_frequency=2,
    special_tokens=[
        '<s>', '</s>', '<pad>', '<unk>', '<mask>',
        '<lvl1>', '</lvl1>',
        '<lvl2>', '</lvl2>',
        '<lvl3>', '</lvl3>',
    ],
)

In [6]:
# save_model() writes vocab.json and merges.txt into the given directory; create the
# directory first so the cell also runs on a fresh checkout where it does not exist yet.
Path('neublla_codex').mkdir(parents=True, exist_ok=True)
tokenizer.save_model('neublla_codex')

['neublla_codex\\vocab.json', 'neublla_codex\\merges.txt']

In [7]:
# Sanity check: show how the trained tokenizer splits a short prompt into BPE tokens.
print(tokenizer.encode("Open word and search about palastine").tokens)

['Open', 'Ġword', 'Ġand', 'Ġs', 'e', 'arch', 'Ġabout', 'Ġp', 'al', 'a', 'st', 'ine']


Vocab size

In [8]:
# Size of the learned vocabulary (here it ends up smaller than the requested vocab_size).
tokenizer.get_vocab_size()

1172

In [9]:
# Token-embedding lookup table: one 4-dimensional vector per vocabulary entry.
# NOTE(review): the same layer is created again under this name in a later cell, which replaces this instance.
token_embed = tf.keras.layers.Embedding(tokenizer.get_vocab_size(), 4)




Preprocessing steps for a single input sample

In [10]:
# Encode one example prompt and keep each view of the result in its own variable.
sample_text = "Open Word and write in it summerization about palastine"
sample_text_encode = tokenizer.encode(sample_text)

# Token strings, their vocabulary ids, and the ids again as a NumPy array.
sample_text_tokens, sample_text_tokens_ids = sample_text_encode.tokens, sample_text_encode.ids
sample_text_tokens_seq = np.asarray(sample_text_tokens_ids)

In [11]:
# Print every intermediate representation of the sample under its own heading.
for heading, shown in (
    ("Sample text encoding info", sample_text_encode),
    ("Sample text tokens", sample_text_tokens),
    ("Sample text tokens ids", sample_text_tokens_ids),
    ("Sample text tokens seq", sample_text_tokens_seq),
):
    print(heading)
    print(shown)

Sample text encoding info
Encoding(num_tokens=19, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Sample text tokens
['Open', 'Ġ', 'W', 'ord', 'Ġand', 'Ġwrite', 'Ġin', 'Ġit', 'Ġsum', 'm', 'er', 'iz', 'ation', 'Ġabout', 'Ġp', 'al', 'a', 'st', 'ine']
Sample text tokens ids
[312, 231, 65, 834, 276, 485, 316, 428, 327, 87, 299, 356, 340, 511, 281, 286, 75, 288, 345]
Sample text tokens seq
[312 231  65 834 276 485 316 428 327  87 299 356 340 511 281 286  75 288
 345]


In [12]:
# Re-create the embedding layer and look up one 4-dimensional vector per token id of the sample.
token_embed = tf.keras.layers.Embedding(tokenizer.get_vocab_size(), 4)
token_embeddngs = token_embed(sample_text_tokens_seq)

# Show the resulting matrix: one row per token of the sample.
print("Embedding for the sample text : ", sample_text)
print(token_embeddngs)

Embedding for the sample text :  Open Word and write in it summerization about palastine
tf.Tensor(
[[ 0.04135612 -0.00572587  0.03753925  0.03605526]
 [ 0.01498193  0.01342216  0.01358687 -0.03752574]
 [ 0.0326019  -0.00050863  0.01794283 -0.04231886]
 [ 0.04451884 -0.01857741 -0.04477407 -0.00033475]
 [ 0.04417815 -0.00697638 -0.0224834  -0.04487335]
 [ 0.04052586  0.04054067  0.04665941 -0.00291703]
 [-0.04508409  0.03455592  0.02227939 -0.00777333]
 [ 0.04890242 -0.03603184  0.00567744  0.016881  ]
 [-0.01856792  0.04017048 -0.01840692  0.04524991]
 [-0.04305512  0.01641395  0.01962784  0.04297377]
 [-0.0237705  -0.03859905 -0.0329558   0.04898859]
 [ 0.02387314 -0.03762925  0.00668324 -0.01946318]
 [ 0.00535963 -0.04968458  0.0174357   0.03351486]
 [ 0.00837679 -0.04752765 -0.04236186 -0.02681269]
 [ 0.04558358  0.03740418  0.044698   -0.03811712]
 [ 0.04007835  0.00682516 -0.03968145 -0.02213687]
 [ 0.02645295 -0.0049573  -0.03480594  0.01608089]
 [ 0.03688603 -0.04027524 -0.0153

In [13]:
# Position-embedding table: one 4-dimensional vector for each of up to 256 positions.
max_sequnce_length = 256
positional_embedding = tf.keras.layers.Embedding(max_sequnce_length, 4)

# Position ids 0 .. len-1, one per token of the sample sequence.
position_index = tf.range(len(sample_text_tokens_seq))
print(position_index)

tf.Tensor([ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18], shape=(19,), dtype=int32)


In [14]:
# Look up one position vector per position index of the sample sequence.
positional_embeddings = positional_embedding(position_index)
print("Position embeddings for the input sequence \n", positional_embeddings)

Position embeddings for the input sequence 
 tf.Tensor(
[[-0.00544344 -0.02864941  0.02475016 -0.02316974]
 [-0.04755345 -0.01766795  0.02219016 -0.02556071]
 [ 0.01220468 -0.00049644  0.02757603  0.00976502]
 [-0.02611597 -0.00513328  0.03445197  0.00394219]
 [-0.03646388  0.01631213 -0.00366051 -0.04461161]
 [ 0.01634559 -0.01496141 -0.0442906  -0.03573897]
 [-0.0434481   0.0332484   0.04691509  0.03815332]
 [-0.01728294 -0.03424324 -0.00068649  0.00536634]
 [-0.04481184  0.00757898 -0.04713923  0.03090293]
 [ 0.04719767 -0.02100389  0.01492533  0.00744049]
 [ 0.03493549  0.01961224 -0.04319534 -0.0047025 ]
 [ 0.0470147   0.0156523  -0.01025097 -0.01084021]
 [-0.00425587 -0.02942917 -0.0305195   0.04409963]
 [-0.00424488 -0.03441774  0.04994818 -0.02517501]
 [ 0.03513424 -0.03474926  0.00937317  0.01592946]
 [-0.04132128 -0.02816194  0.01747085  0.02480358]
 [-0.01298972  0.01999925  0.02672352 -0.01542353]
 [ 0.03422523  0.01432257 -0.03386471  0.01039493]
 [-0.00597199  0.04958165 

In [15]:
# Element-wise sum of the token and positional embeddings, printed as the input to the first encoder block.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the notebook.
input = token_embeddngs + positional_embeddings
print("Input to the initial encoder block : \n", input)

Input to the initial encoder block : 
 tf.Tensor(
[[ 0.03591268 -0.03437529  0.06228942  0.01288551]
 [-0.03257152 -0.00424579  0.03577703 -0.06308645]
 [ 0.04480659 -0.00100506  0.04551887 -0.03255384]
 [ 0.01840287 -0.02371069 -0.01032209  0.00360744]
 [ 0.00771428  0.00933575 -0.02614391 -0.08948496]
 [ 0.05687145  0.02557926  0.00236881 -0.038656  ]
 [-0.08853219  0.06780431  0.06919448  0.03038   ]
 [ 0.03161948 -0.07027508  0.00499095  0.02224734]
 [-0.06337976  0.04774946 -0.06554614  0.07615285]
 [ 0.00414255 -0.00458994  0.03455316  0.05041425]
 [ 0.01116499 -0.01898681 -0.07615115  0.04428609]
 [ 0.07088784 -0.02197694 -0.00356773 -0.0303034 ]
 [ 0.00110376 -0.07911376 -0.0130838   0.07761449]
 [ 0.00413191 -0.08194539  0.00758633 -0.0519877 ]
 [ 0.08071782  0.00265492  0.05407117 -0.02218766]
 [-0.00124292 -0.02133678 -0.0222106   0.00266672]
 [ 0.01346323  0.01504196 -0.00808243  0.00065736]
 [ 0.07111125 -0.02595267 -0.04918033  0.00272284]
 [ 0.00961767  0.08147581  0.074

Batch input preprocessing steps

In [16]:
# Batching: three example prompts that are encoded, padded and masked together in the following cells.
input_batch = [
    'Open Word and save it in "Home directory" as "my_word_file"',
    'Connect the wifi to "Aizen-sama" network',
    'Search about palastine new today and give me a summary about it'
]


In [17]:
# Encode the whole batch in a single call.
input_batch_encodeing = tokenizer.encode_batch(input_batch)

# Token-id sequence of every prompt; works for any batch size instead of
# indexing exactly three encodings by hand.
input_seqs = [encoding.ids for encoding in input_batch_encodeing]

print("Vectorized inputs : \n")
print(input_seqs)

# Pad at the end ("post") so that all sequences share the length of the longest one.
padded_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(input_seqs, padding="post")
print("input to the encoder is : ")
print(padded_input_seqs.shape)
print(padded_input_seqs)

Vectorized inputs : 

[[312, 231, 65, 834, 276, 1011, 428, 316, 231, 12, 50, 858, 290, 83, 278, 366, 411, 12, 998, 231, 12, 87, 99, 73, 97, 834, 73, 80, 505, 12], [45, 269, 88, 529, 293, 317, 83, 80, 83, 342, 231, 12, 43, 356, 275, 23, 93, 75, 963, 12, 369, 347, 97, 477], [325, 511, 281, 286, 75, 288, 345, 464, 342, 78, 612, 276, 491, 83, 326, 463, 267, 354, 511, 428]]
input to the encoder is : 
(3, 30)
[[ 312  231   65  834  276 1011  428  316  231   12   50  858  290   83
   278  366  411   12  998  231   12   87   99   73   97  834   73   80
   505   12]
 [  45  269   88  529  293  317   83   80   83  342  231   12   43  356
   275   23   93   75  963   12  369  347   97  477    0    0    0    0
     0    0]
 [ 325  511  281  286   75  288  345  464  342   78  612  276  491   83
   326  463  267  354  511  428    0    0    0    0    0    0    0    0
     0    0]]


In [18]:
# Padding mask: 1.0 where the padded batch holds a non-zero id, 0.0 where it holds 0.
# NOTE(review): this treats id 0 as padding; confirm the tokenizer never emits id 0 for real text.
encoder_mask = tf.cast(tf.math.not_equal(padded_input_seqs, 0), tf.float32)
print('padded input : ')
print(padded_input_seqs, '\n')
print("Encoder mask : ")
print(encoder_mask)

padded input : 
[[ 312  231   65  834  276 1011  428  316  231   12   50  858  290   83
   278  366  411   12  998  231   12   87   99   73   97  834   73   80
   505   12]
 [  45  269   88  529  293  317   83   80   83  342  231   12   43  356
   275   23   93   75  963   12  369  347   97  477    0    0    0    0
     0    0]
 [ 325  511  281  286   75  288  345  464  342   78  612  276  491   83
   326  463  267  354  511  428    0    0    0    0    0    0    0    0
     0    0]] 

Encoder mask : 
tf.Tensor(
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]], shape=(3, 30), dtype=float32)


In [19]:
# Expand the mask to (batch, 1, 1, seq_len) by inserting two singleton axes.
# A reshape is used instead of indexing with tf.newaxis so that the cell is idempotent:
# re-running it leaves an already expanded mask unchanged instead of stacking more axes.
encoder_mask = tf.reshape(encoder_mask, (encoder_mask.shape[0], 1, 1, encoder_mask.shape[-1]))
encoder_mask

<tf.Tensor: shape=(3, 1, 1, 30), dtype=float32, numpy=
array([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]]],
      dtype=float32)>

# Encoder 

#### Multi-Head Self-Attention

$Q$ => Queries <br>
$K$ => Keys    <br>
$V$ => Values  <br>

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{Q K^{T}}{\sqrt{d_k}}\right) V$$ where $d_k$ is the dimension of the keys.

In [20]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        query: Query tensor; its last axis must match the last axis of ``key``.
        key: Key tensor; the size of its last axis is used as d_k for scaling.
        value: Value tensor, combined according to the attention weights.
        mask: Optional tensor broadcastable to the score matrix. Positions where
            it equals 0 are excluded from attention. ``None`` disables masking.

    Returns:
        A tuple ``(output, weights)``: the attention-weighted values and the
        softmax attention weights (normalised over the last axis).
    """
    scores = tf.matmul(query, key, transpose_b=True)

    # Scale with TensorFlow ops only (np.sqrt cannot consume symbolic tensors,
    # which would break the function inside tf.function / Keras graph execution).
    key_dimension = tf.cast(tf.shape(key)[-1], scores.dtype)
    scaled_scores = scores / tf.math.sqrt(key_dimension)

    if mask is not None:
        # A large finite negative instead of -inf: masked positions still get
        # zero weight, but a fully masked row no longer turns into NaN.
        blocked = tf.constant(-1e9, dtype=scaled_scores.dtype)
        scaled_scores = tf.where(tf.equal(mask, 0), blocked, scaled_scores)

    # Stateless softmax; avoids building a new Keras layer on every call.
    weights = tf.nn.softmax(scaled_scores, axis=-1)

    return tf.matmul(weights, value), weights

## Testing scaled_dot_product_attention

In [21]:
seq_len = 3
embed_dim = 4

# Seeded generator so the toy tensors (and every output derived from them)
# are reproducible across Restart & Run All.
rng = np.random.default_rng(0)
queries = rng.random((seq_len, embed_dim))
keys = rng.random((seq_len, embed_dim))
values = rng.random((seq_len, embed_dim))

print("Queries:\n", queries)

Queries:
 [[0.80500383 0.03377964 0.13195879 0.84726906]
 [0.45987372 0.72889927 0.28293487 0.38553415]
 [0.53535543 0.47184523 0.59735394 0.28063174]]


In [22]:
# Run attention on the toy tensors without a mask and show both returned values.
output, attn_weights = scaled_dot_product_attention(queries, keys, values)

print("Output\n", output, "\n")
print("Weights\n", attn_weights)

Output
 tf.Tensor(
[[0.5502248  0.72442263 0.8154087  0.63397896]
 [0.543003   0.70325315 0.8146995  0.6312506 ]
 [0.5385741  0.6985322  0.81817216 0.6330553 ]], shape=(3, 4), dtype=float32) 

Weights
 tf.Tensor(
[[0.28666583 0.34785348 0.36548063]
 [0.2609815  0.38686407 0.3521544 ]
 [0.26998356 0.39809936 0.33191705]], shape=(3, 3), dtype=float32)


## MHSA

In [23]:
class MultHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Queries, keys and values are linearly projected to ``dimension_model``
    features, split into ``num_heads`` heads of ``dimension_model // num_heads``
    features each, attended per head, merged back and projected once more.

    Args:
        dimension_model: Feature width of the projections and of the output.
        num_heads: Number of attention heads; must divide ``dimension_model``.

    Raises:
        ValueError: If ``dimension_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, dimension_model, num_heads):
        super().__init__()
        if dimension_model % num_heads != 0:
            raise ValueError(
                f"dimension_model ({dimension_model}) must be divisible by num_heads ({num_heads})"
            )

        self.dimension_model = dimension_model
        self.num_heads = num_heads

        self.dimension_head = self.dimension_model // self.num_heads

        self.query_weights = tf.keras.layers.Dense(self.dimension_model)
        self.key_weights = tf.keras.layers.Dense(self.dimension_model)
        self.value_weights = tf.keras.layers.Dense(self.dimension_model)

        self.dense = tf.keras.layers.Dense(self.dimension_model)

    def split_heads(self, x):
        """Reshape (batch, seq, dimension_model) -> (batch, num_heads, seq, dimension_head)."""
        # tf.shape gives the runtime batch size, so this also works when the
        # static batch dimension is unknown.
        batch_size = tf.shape(x)[0]

        split_inputs = tf.reshape(x, (batch_size, -1, self.num_heads, self.dimension_head))
        return tf.transpose(split_inputs, perm=[0, 2, 1, 3])

    def merge_heads(self, x):
        """Inverse of ``split_heads``: (batch, num_heads, seq, dimension_head) -> (batch, seq, dimension_model)."""
        batch_size = tf.shape(x)[0]

        merge_inputs = tf.transpose(x, perm=[0, 2, 1, 3])
        return tf.reshape(merge_inputs, (batch_size, -1, self.dimension_model))

    def call(self, q, k, v, mask=None):
        """Apply multi-head attention.

        Args:
            q, k, v: Tensors whose last axis is projected to ``dimension_model``;
                the reshape in ``split_heads`` expects (batch, seq, features).
            mask: Optional mask forwarded to ``scaled_dot_product_attention``;
                it must broadcast against (batch, num_heads, seq_q, seq_k).

        Returns:
            A tuple ``(output, attention_weights)`` where ``output`` has shape
            (batch, seq_q, dimension_model) and ``attention_weights`` holds the
            per-head weights returned by ``scaled_dot_product_attention``.
        """
        # Project, then split into heads so that attention runs per head.
        qs = self.split_heads(self.query_weights(q))
        ks = self.split_heads(self.key_weights(k))
        vs = self.split_heads(self.value_weights(v))

        output, attention_weights = scaled_dot_product_attention(qs, ks, vs, mask)
        output = self.merge_heads(output)

        return self.dense(output), attention_weights

## Testing MHSA

In [24]:
# Toy dimensions for exercising the multi-head attention layer.
batch_size, seq_len = 1, 3
embed_dim, num_heads = 12, 3

# Width of a single head when embed_dim is divided across num_heads.
head_dim = embed_dim // num_heads

print(f"Dimension of each head: {head_dim}")

Dimension of each head: 4


In [25]:
# Seeded random input (rounded to one decimal for readability) so the demo
# values are reproducible across Restart & Run All.
x = np.random.default_rng(0).random((batch_size, seq_len, embed_dim)).round(1)
print("Input shape: ", x.shape, "\n")
print("Input:\n", x)

Input shape:  (1, 3, 12) 

Input:
 [[[0.5 0.5 0.5 0.3 0.5 0.9 0.5 0.6 0.7 0.7 0.8 0.8]
  [0.8 0.9 0.5 0.7 0.7 0.6 0.6 0.1 0.6 0.2 0.3 1. ]
  [0.8 0.2 0.5 0.7 0.5 0.9 0.8 0.6 0.3 0.3 0.5 0.7]]]


In [26]:
# Self-attention: the same tensor is passed as queries, keys and values, with no mask.
mhsa = MultHeadSelfAttention(12, 3)

output, attn_weights = mhsa(x, x, x, None)
print(f"MHSA output{output.shape}:")
print(output)

MHSA output(1, 3, 12):
tf.Tensor(
[[[-1.2071465  -0.06055957  0.10553324  0.26423037 -0.31229752
   -1.1887401  -0.05269921  0.3778549  -0.8093019  -0.63126004
   -2.1432676   0.43046826]
  [-1.2079288  -0.05357718  0.10137948  0.25435773 -0.30541348
   -1.1914468  -0.04718618  0.36833608 -0.81488645 -0.6340164
   -2.1403008   0.4294031 ]
  [-1.2087305  -0.05903447  0.10013899  0.2558182  -0.30636075
   -1.1827621  -0.04979318  0.37264413 -0.8032694  -0.63422287
   -2.134417    0.42795324]]], shape=(1, 3, 12), dtype=float32)


# FFN

In [27]:
def feed_forward_network(dimension_model, hidden_dimension):
    """Build the two-layer feed-forward sub-network.

    Args:
        dimension_model: Number of units of the second (linear) Dense layer.
        hidden_dimension: Number of units of the first Dense layer, which uses ReLU.

    Returns:
        A ``tf.keras.Sequential`` that stacks the two Dense layers in that order.
    """
    hidden_layer = tf.keras.layers.Dense(hidden_dimension, activation='relu')
    output_layer = tf.keras.layers.Dense(dimension_model)
    return tf.keras.Sequential([hidden_layer, output_layer])

## Encoder Block

In [28]:
class EncoderBlock(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual (skip)
    connection and layer normalisation.

    Args:
        dimension_model: Feature width passed to the attention layer and the
            feed-forward network.
        num_heads: Number of attention heads.
        hidden_dimension: Hidden width of the feed-forward network.
        dropout_rate: Rate used by both dropout layers.
    """

    def __init__(self, dimension_model, num_heads, hidden_dimension, dropout_rate=0.1):
        super().__init__()

        # Sub-layers.
        self.mhsa = MultHeadSelfAttention(dimension_model, num_heads)
        self.ffn = feed_forward_network(dimension_model, hidden_dimension)

        # One dropout / layer-norm pair per sub-layer.
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()

    def call(self, x, training, mask):
        """Run the block on ``x``.

        Args:
            x: Input tensor, used as queries, keys and values of the attention.
            training: Forwarded to both dropout layers.
            mask: Forwarded to the attention layer.

        Returns:
            A tuple ``(output, attention_weights)``.
        """
        # Attention sub-layer: dropout, then residual connection + layer norm.
        attended, attention_weights = self.mhsa(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        attention_out = self.layernorm1(x + attended)

        # Feed-forward sub-layer with the same dropout / residual / norm pattern.
        transformed = self.dropout2(self.ffn(attention_out), training=training)
        block_out = self.layernorm2(attention_out + transformed)

        return block_out, attention_weights

## Encoder

In [29]:
class Encoder(tf.keras.layers.Layer):
    """Stack of encoder blocks on top of token + learned positional embeddings.

    Args:
        num_blocks: Number of ``EncoderBlock`` layers to stack.
        dimension_model: Embedding width, also used by every block.
        num_heads: Number of attention heads per block.
        hidden_dimension: Hidden width of each block's feed-forward network.
        src_vocab_size: Number of rows of the token-embedding table.
        max_seq_len: Number of rows of the positional-embedding table, i.e. the
            longest sequence for which position vectors exist.
        dropout_rate: Rate of the embedding dropout and of every block's dropout.
    """

    def __init__(self, num_blocks, dimension_model, num_heads, hidden_dimension, src_vocab_size, max_seq_len, dropout_rate=0.1):
        super().__init__()

        self.dimension_model = dimension_model
        self.max_sql_len = max_seq_len

        self.token_embedding = tf.keras.layers.Embedding(src_vocab_size, self.dimension_model)
        self.positonal_embedding = tf.keras.layers.Embedding(max_seq_len, self.dimension_model)

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        self.blocks = [EncoderBlock(self.dimension_model, num_heads, hidden_dimension, dropout_rate)
                       for _ in range(num_blocks)]

    def call(self, input, training, mask):
        """Embed ``input`` and pass it through every encoder block.

        Args:
            input: Integer token ids, indexed as (batch, seq_len).
            training: Forwarded to the dropout layers.
            mask: Forwarded to every block's attention layer.

        Returns:
            A tuple ``(x, weights)``: the output of the last block and the
            attention weights of the last block (``None`` when there are no blocks).
        """
        token_embeddings = self.token_embedding(input)

        # Position ids follow the actual sequence length of this batch, so inputs
        # shorter than max_seq_len work too. The (seq_len, dim) position vectors
        # broadcast over the batch axis when added to the token embeddings.
        positional_index = tf.range(tf.shape(input)[1])
        positional_embeddings = self.positonal_embedding(positional_index)

        x = self.dropout(token_embeddings + positional_embeddings, training=training)

        # Defined up front so the return below is valid even with zero blocks.
        weights = None
        for block in self.blocks:
            x, weights = block(x, training, mask)

        return x, weights

## Testing the encoder

In [30]:
# Hyper-parameters of the demo encoder.
num_blocks = 6
dimension_model = 12
num_heads = 3
hidden_dimension = 48

# Sizes taken from the data prepared above: the trained vocabulary and the padded batch width.
src_vocab_size = tokenizer.get_vocab_size()
max_seq_len = padded_input_seqs.shape[1]

encoder = Encoder(
    num_blocks=num_blocks,
    dimension_model=dimension_model,
    num_heads=num_heads,
    hidden_dimension=hidden_dimension,
    src_vocab_size=src_vocab_size,
    max_seq_len=max_seq_len,
)

In [31]:
# Run the padded batch through the encoder with training=True and the padding mask;
# the second return value holds the attention weights of the last encoder block.
encoder_output, attn_wieghts = encoder(input=padded_input_seqs, training=True, mask=encoder_mask)

print(f"Encoder Output {encoder_output.shape}:")
print(encoder_output)


Encoder Output (3, 30, 12):
tf.Tensor(
[[[ 0.86667323  0.6334505  -0.8810883  ...  1.5660397  -2.4376056
   -0.45869628]
  [ 1.3837068   0.93641776 -1.096625   ...  1.6101617  -0.43953797
   -0.93298656]
  [ 0.82491654  0.9677865  -1.3013407  ...  1.1717308  -1.8046833
   -0.3565267 ]
  ...
  [ 0.83404845  0.8141851  -0.34106517 ...  1.6112067  -1.835874
   -0.7375326 ]
  [ 0.8244405   0.542211   -0.92117625 ...  1.5226535  -2.0762346
   -1.09233   ]
  [ 0.73770005  1.0138857  -1.0542717  ...  1.9158467  -1.9500427
   -0.7627504 ]]

 [[-0.61692566  1.1565527   0.39518926 ... -0.9645132  -1.7991585
   -0.6586123 ]
  [-1.2369397   0.13564107  0.48053375 ...  0.3204566  -2.5138562
   -0.43897542]
  [-2.1387205   0.31437173 -0.26597062 ...  0.11428353 -1.1958406
    0.14232978]
  ...
  [-0.66300243  1.5214471   0.23052539 ... -0.05507722 -1.3478061
   -0.13909958]
  [-0.3794233   1.2180223  -0.06938656 ...  0.9477912  -0.45573404
   -1.1960592 ]
  [-1.3385246   0.90298843 -0.12966353 ... 