In [1]:
import numpy as np
import math

In [2]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import tensorflow as tf




In [3]:
paths = [str(x) for x in Path('../prompts_dataset/').glob('*.txt')]

# Tokenizer

In [4]:
tokenizer = ByteLevelBPETokenizer()

In [5]:
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2, special_tokens= [
    '<s>', '</s>', '<pad>', '<unk>', '<mask>', '<lvl1>', '</lvl1>', '<lvl2>', '</lvl2>', '<lvl3>', '</lvl3>'
])

In [6]:
tokenizer.save_model('neublla_codex')

['neublla_codex\\vocab.json', 'neublla_codex\\merges.txt']

In [7]:
print(tokenizer.encode("Open word and search about palastine").tokens)

['Open', 'Ġword', 'Ġand', 'Ġs', 'e', 'arch', 'Ġabout', 'Ġp', 'al', 'a', 'st', 'ine']


Vocab size

In [8]:
tokenizer.get_vocab_size()

1172

In [9]:
token_embed = tf.keras.layers.Embedding(tokenizer.get_vocab_size(), 4)




one input sample text preprocessing steps

In [10]:
sample_text = "Open Word and write in it summerization about palastine"

sample_text_encode = tokenizer.encode(sample_text)
sample_text_tokens = sample_text_encode.tokens
sample_text_tokens_ids = sample_text_encode.ids
sample_text_tokens_seq = np.array(sample_text_tokens_ids)

In [11]:
print("Sample text encoding info")
print(sample_text_encode)
print("Sample text tokens")
print(sample_text_tokens)
print("Sample text tokens ids")
print(sample_text_tokens_ids)
print("Sample text tokens seq")
print(sample_text_tokens_seq)

Sample text encoding info
Encoding(num_tokens=19, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Sample text tokens
['Open', 'Ġ', 'W', 'ord', 'Ġand', 'Ġwrite', 'Ġin', 'Ġit', 'Ġsum', 'm', 'er', 'iz', 'ation', 'Ġabout', 'Ġp', 'al', 'a', 'st', 'ine']
Sample text tokens ids
[312, 231, 65, 834, 276, 485, 316, 428, 327, 87, 299, 356, 340, 511, 281, 286, 75, 288, 345]
Sample text tokens seq
[312 231  65 834 276 485 316 428 327  87 299 356 340 511 281 286  75 288
 345]


In [12]:
token_embed = tf.keras.layers.Embedding(tokenizer.get_vocab_size(), 4)
token_embeddngs = token_embed(sample_text_tokens_seq)

print("Embedding for the sample text : ", sample_text)
print(token_embeddngs)

Embedding for the sample text :  Open Word and write in it summerization about palastine
tf.Tensor(
[[-0.03630493  0.0274328  -0.00939428 -0.02420539]
 [ 0.01618392  0.02466606 -0.03620122  0.0063921 ]
 [ 0.02877239  0.03542027 -0.03710765 -0.032242  ]
 [-0.01899385  0.03584211  0.00212605 -0.04259966]
 [ 0.02594898  0.02037451 -0.0040069  -0.02101295]
 [ 0.01231384  0.02004592  0.02444588 -0.02153223]
 [-0.04819325  0.01104289  0.00707861  0.02770778]
 [ 0.01950746  0.04034619 -0.03092113 -0.02754783]
 [-0.03130431 -0.03454777 -0.00364064  0.02853609]
 [-0.00214807 -0.02208556  0.0242735   0.00072006]
 [ 0.0302528  -0.01662111  0.03619123  0.03471193]
 [-0.01913691 -0.03676101 -0.04960042 -0.02287838]
 [ 0.0084547  -0.01431551  0.00488999 -0.02975402]
 [ 0.0152544  -0.01315112  0.04092289  0.01256461]
 [-0.04864443 -0.0039563  -0.00519235  0.04519092]
 [-0.0271754   0.0185007   0.01124263  0.03897769]
 [-0.02620639 -0.03252237 -0.00217569  0.01760075]
 [-0.01326066 -0.04830096  0.0218

In [13]:
max_sequnce_length = 256
positional_embedding = tf.keras.layers.Embedding(max_sequnce_length, 4)

position_index = tf.range(len(sample_text_tokens_seq))
print(position_index)

tf.Tensor([ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18], shape=(19,), dtype=int32)


In [14]:
positional_embeddings = positional_embedding(position_index)
print("Position embeddings for the input sequence \n", positional_embeddings)

Position embeddings for the input sequence 
 tf.Tensor(
[[-0.03884591 -0.01983836 -0.02478283 -0.0480801 ]
 [-0.04058124  0.04588704 -0.03497245 -0.00831879]
 [-0.0489208  -0.00684286 -0.03502395  0.02460959]
 [-0.00570335 -0.00302011  0.0182308   0.03927718]
 [-0.04746342 -0.01213201  0.01247581 -0.0020106 ]
 [ 0.04292064  0.00975777 -0.00591248  0.04649416]
 [ 0.03875117  0.01110225  0.04103955 -0.00164498]
 [ 0.01972629 -0.04984891  0.03763548  0.04384668]
 [ 0.021817   -0.0072284  -0.04759295 -0.02095134]
 [-0.03592795  0.04394368  0.02386128 -0.00324935]
 [ 0.01474606 -0.02054697  0.03108385 -0.03301616]
 [ 0.00983285  0.03703486  0.01827924 -0.0471835 ]
 [ 0.01684762  0.02001305 -0.03500403  0.01748348]
 [-0.00653331 -0.03454687  0.00137032  0.03025103]
 [-0.00987024  0.04271311  0.0376168  -0.01781525]
 [ 0.02537708  0.00454026  0.04553256  0.03755797]
 [-0.02649524 -0.02697958 -0.04152282 -0.01068585]
 [-0.00125985  0.01459514 -0.01035311 -0.00119312]
 [-0.039701    0.02395973 

In [15]:
input = token_embeddngs + positional_embeddings
print("Input to the initial encoder block : \n", input)

Input to the initial encoder block : 
 tf.Tensor(
[[-0.07515083  0.00759445 -0.0341771  -0.07228549]
 [-0.02439732  0.0705531  -0.07117367 -0.00192669]
 [-0.02014841  0.0285774  -0.0721316  -0.00763241]
 [-0.02469721  0.032822    0.02035685 -0.00332247]
 [-0.02151444  0.0082425   0.00846891 -0.02302355]
 [ 0.05523448  0.02980369  0.0185334   0.02496193]
 [-0.00944208  0.02214514  0.04811816  0.0260628 ]
 [ 0.03923374 -0.00950272  0.00671435  0.01629885]
 [-0.00948731 -0.04177617 -0.05123359  0.00758475]
 [-0.03807602  0.02185812  0.04813478 -0.00252929]
 [ 0.04499886 -0.03716809  0.06727508  0.00169577]
 [-0.00930406  0.00027385 -0.03132118 -0.07006188]
 [ 0.02530232  0.00569754 -0.03011404 -0.01227054]
 [ 0.00872109 -0.04769799  0.04229321  0.04281564]
 [-0.05851468  0.03875681  0.03242445  0.02737566]
 [-0.00179832  0.02304096  0.05677519  0.07653566]
 [-0.05270163 -0.05950195 -0.04369851  0.0069149 ]
 [-0.01452051 -0.03370582  0.01154616 -0.02794413]
 [-0.01984125  0.06830088  0.006

Batch input preprocessing steps

In [16]:
# Batching
input_batch = [
    'Open Word and save it in "Home directory" as "my_word_file"',
    'Connect the wifi to "Aizen-sama" network',
    'Search about palastine new today and give me a summary about it'
]


In [17]:
# encode the bache
input_batch_encodeing = tokenizer.encode_batch(input_batch)

# input sequences
input_seqs = []

input_seqs.append(input_batch_encodeing[0].ids)
input_seqs.append(input_batch_encodeing[1].ids)
input_seqs.append(input_batch_encodeing[2].ids)

print("Vectorized inputs : \n")
print(input_seqs)

# padding the inputs to be in the same length
padded_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(input_seqs, padding="post")
print("input to the encoder is : ")
print(padded_input_seqs.shape)
print(padded_input_seqs)

Vectorized inputs : 

[[312, 231, 65, 834, 276, 1011, 428, 316, 231, 12, 50, 858, 290, 83, 278, 366, 411, 12, 998, 231, 12, 87, 99, 73, 97, 834, 73, 80, 505, 12], [45, 269, 88, 529, 293, 317, 83, 80, 83, 342, 231, 12, 43, 356, 275, 23, 93, 75, 963, 12, 369, 347, 97, 477], [325, 511, 281, 286, 75, 288, 345, 464, 342, 78, 612, 276, 491, 83, 326, 463, 267, 354, 511, 428]]
input to the encoder is : 
(3, 30)
[[ 312  231   65  834  276 1011  428  316  231   12   50  858  290   83
   278  366  411   12  998  231   12   87   99   73   97  834   73   80
   505   12]
 [  45  269   88  529  293  317   83   80   83  342  231   12   43  356
   275   23   93   75  963   12  369  347   97  477    0    0    0    0
     0    0]
 [ 325  511  281  286   75  288  345  464  342   78  612  276  491   83
   326  463  267  354  511  428    0    0    0    0    0    0    0    0
     0    0]]


In [18]:
encoder_mask = tf.cast(tf.math.not_equal(padded_input_seqs, 0), tf.float32)
print('padded input : ')
print(padded_input_seqs, '\n')
print("Encoder mask : ")
print(encoder_mask)

padded input : 
[[ 312  231   65  834  276 1011  428  316  231   12   50  858  290   83
   278  366  411   12  998  231   12   87   99   73   97  834   73   80
   505   12]
 [  45  269   88  529  293  317   83   80   83  342  231   12   43  356
   275   23   93   75  963   12  369  347   97  477    0    0    0    0
     0    0]
 [ 325  511  281  286   75  288  345  464  342   78  612  276  491   83
   326  463  267  354  511  428    0    0    0    0    0    0    0    0
     0    0]] 

Encoder mask : 
tf.Tensor(
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0.]], shape=(3, 30), dtype=float32)


In [19]:
# expanded dimenstion of the mask
encoder_mask = encoder_mask[:, tf.newaxis, tf.newaxis, :]
encoder_mask

<tf.Tensor: shape=(3, 1, 1, 30), dtype=float32, numpy=
array([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]]],
      dtype=float32)>

# Encoder 

#### Multi-Head Self-Attention

Q => Queries <br>
K => Keysz   <br>
V => Values  <br>

Attention (Q, K, V) = softmax( (Q* K**T) / (sqrt(dimension_of_K) ) ) * V

In [20]:
def scaled_dot_product_attention(query, key, value, mask=None):
    key_dimension = tf.cast(tf.shape(key)[-1], tf.float32)
    scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dimension)

    if mask is not None:
        scaled_scores = tf.where(mask==0, -np.inf, scaled_scores)
    
    softmax = tf.keras.layers.Softmax()
    weights = softmax(scaled_scores)

    return tf.matmul(weights, value), weights

In [21]:
class MultHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, dimension_model, num_heads):
        super(MultHeadSelfAttention, self).__init__()
        self.dimension_model = dimension_model
        self.num_heads = num_heads

        self.dimension_head = self.dimension_model // self.num_heads

        self.query_weights = tf.keras.layers.Dense(self.dimension_model)
        self.key_weights = tf.keras.layers.Dense(self.dimension_model)
        self.value_weights = tf.keras.layers.Dense(self.dimension_model)

        self.dense = tf.keras.layers.Dense(self.dimension_model)
    
    def split_heads(self, x):
        batch_size = x.shape[0]

        split_inputs = tf.reshape(x, (batch_size, -1, self.num_heads, self.dimension_head))
        return tf.transpose(split_inputs, prem=[0, 2, 1, 3])
    
    def merge_heads(self, x):
        batch_size = x.shape[0]

        merge_inputs = tf.transpose(x, perm=[0, 2, 1 ,3])
        return tf.reshape(merge_inputs, (batch_size, -1, self.dimension_model))
    
    def call(self, q, k, v, mask):
        qs = self.query_weights(q)
        ks = self.key_weights(k)
        ws = self.value_weights(v)

        output, attention_weights = scaled_dot_product_attention(qs, ks, ws, mask)
        output = self.merge_heads(output)

        return self.dense(output), attention_weights

In [22]:
def feed_forward_network(dimension_model, hidden_dimension):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(hidden_dimension, activation='relu'),
        tf.keras.layers.Dense(dimension_model)
    ])

In [23]:
class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self, dimension_model, num_heads, hidden_dimension, dropout_rate=0.1):
        super(EncoderBlock, self).__init__()

        self.mhsa = MultHeadSelfAttention(dimension_model, num_heads)
        self.ffn = feed_forward_network(dimension_model, hidden_dimension)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()
    
    def call(self, x, training, mask):
        mhsa_output, attention_weights = self.mhsa(x, x, x, mask)
        # drop out
        mhsa_output = self.dropout1(mhsa_output, training=training)
        # skip connection
        mhsa_output = self.layernorm1(x + mhsa_output)

        ffn_output = self.ffn(mhsa_output)
        ffn_output = self.dropout2(ffn_output, trainin=training)
        output = self.layernorm2(mhsa_output + ffn_output)

        return output, attention_weights

In [24]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_blocks, dimension_model, num_heads, hidden_dimension, src_vocab_size, max_seq_len, dropout_rate=0.1):
        super(Encoder, self).__init__()

        self.dimension_model = dimension_model
        self.max_sql_len = max_seq_len

        self.token_embedding = tf.keras.layers.Embedding(src_vocab_size, self.dimension_model)
        self.positonal_embedding = tf.keras.layers.Embedding(max_seq_len, self.dimension_model)

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        self.blocks = [EncoderBlock(self.dimension_model, num_heads, hidden_dimension, dropout_rate)
                       for _ in range(num_blocks)]
    
    def call(self, input, training, mask):
        token_embeddings = self.token_embedding(input)

        num_pos = input.shape[0] * self.max_sql_len
        positional_index = np.resize(np.arange(self.max_sql_len), num_pos)
        positional_index = np.reshape(positional_index, input.shape)
        positional_embeddings = self.positonal_embedding(positional_index)

        x = self.dropout(token_embeddings + positional_embeddings, training=training)

        for block in self.blocks:
            x, weights = block(x, training, mask)
        
        return x, weights

<div style="text-align:center;"><h3>declaring an encoder</h3></div>

In [25]:
num_blocks = 6

dimension_model = 12

num_heads = 3

hidden_dimension = 48

src_vocab_size = tokenizer.get_vocab_size()

max_seq_len = padded_input_seqs.shape[1]

encoder = Encoder(
    num_blocks,
    dimension_model,
    num_heads,
    hidden_dimension,
    src_vocab_size,
    max_seq_len
)

In [26]:
encoder_output, attn_wieghts = encoder(input=padded_input_seqs, training=True, mask=encoder_mask)

print(f"Encoder Output {encoder_output.shape}:")
print(encoder_output)

InvalidArgumentError: Exception encountered when calling layer 'encoder_block' (type EncoderBlock).

{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [3,30,12] vs. [3,90,12] [Op:AddV2] name: 

Call arguments received by layer 'encoder_block' (type EncoderBlock):
  • x=tf.Tensor(shape=(3, 30, 12), dtype=float32)
  • training=True
  • mask=tf.Tensor(shape=(3, 1, 1, 30), dtype=float32)