<a href="https://colab.research.google.com/github/QaziSaim/DeepLearning--DeepNeuralNetwork-/blob/main/Transformer_Designing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from datasets import load_dataset
import numpy as np

### Loading Dataset

In [2]:
dataset = load_dataset('wikitext','wikitext-2-raw-v1')
text = " ".join(dataset['train']['text'])
print('Total characters:',len(text))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Total characters: 10929707


### Tokenization

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
vocab_size = len(tokenizer.word_index) + 1
print('Vocab Size ',vocab_size)
tokens = tokenizer.texts_to_sequences([text])[0]

Vocab Size  909


In [5]:
seq_length = 10
input_sequences = []
output_words = []

for i in range(seq_length, len(tokens)):
    input_sequences.append(tokens[i-seq_length:i])
    output_words.append(tokens[i])

input_sequences = np.array(input_sequences)
output_words = np.array(output_words)
print("Training samples:", input_sequences.shape)


Training samples: (69351, 10)


In [15]:
input_sequences.shape

(69351, 10)

In [11]:
demo_embeddings = tf.keras.layers.Embedding(vocab_size,128)

In [13]:
self_embedding = demo_embeddings(input_sequences)

In [14]:
another_self_embeddign = self_embedding * tf.math.sqrt(tf.cast())

<tf.Tensor: shape=(69351, 10, 128), dtype=float32, numpy=
array([[[ 2.39900462e-02, -1.24145746e-02,  2.57377140e-02, ...,
          1.13019943e-02,  1.83961131e-02, -3.59116085e-02],
        [ 2.39900462e-02, -1.24145746e-02,  2.57377140e-02, ...,
          1.13019943e-02,  1.83961131e-02, -3.59116085e-02],
        [-4.30196039e-02,  1.89990290e-02, -1.24235526e-02, ...,
         -3.74717005e-02, -3.73480693e-02, -3.80657800e-02],
        ...,
        [ 3.83177511e-02, -4.16037217e-02, -4.04310226e-02, ...,
          3.57087888e-02,  3.72622721e-02, -2.79692542e-02],
        [-4.30196039e-02,  1.89990290e-02, -1.24235526e-02, ...,
         -3.74717005e-02, -3.73480693e-02, -3.80657800e-02],
        [-4.30196039e-02,  1.89990290e-02, -1.24235526e-02, ...,
         -3.74717005e-02, -3.73480693e-02, -3.80657800e-02]],

       [[ 2.39900462e-02, -1.24145746e-02,  2.57377140e-02, ...,
          1.13019943e-02,  1.83961131e-02, -3.59116085e-02],
        [-4.30196039e-02,  1.89990290e-02, -1

In [6]:
import tensorflow as tf

In [7]:
def scaled_dot_product_attention(q,k,v):
  matmul = tf.matmul(q, k, transpose_b=True) # first it transpose the key vector then it perform dot product q @ k.T if k is numpy array then it works
  dk = tf.cast(tf.shape(k)[-1],tf.float32) # then the datatype of key vector  changes to float32
  scaled_logits = matmul / tf.math.sqrt(dk)
  weights = tf.nn.softmat(scaled_logits,axis = -1)
  output = tf.matmul(weights,v)
  return output

In [8]:
class MultiHeadAttention(layers.Layer):
  def __init__(self, d_model, num_heads):
    super().__init__()
    assert d_model % num_heads ==0
    self.num_heads = num_heads
    self.depth = d_model // num_heads
    self.wq = layers.Dense(d_model)
    self.wk = layers.Dense(d_model)
    self.wv = layers.Dense(d_model)
    self.dense = layers.Dense(d_model)

  def split_heads(self, x ,batch_size):
    x = tf.reshape(x,(batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x,[0, 2, 1, 3])

  def call(self, q, k, v):
    batch_size = tf.shape(q)[0]
    q = self.split_heads(self.wq(q), batch_size)
    k = self.split_heads(self.wk(k), batch_size)
    v = self.split_heads(self.wv(v), batch_size)
    attn = scaled_dot_product_attention(q, k, v)
    attn = tf.transpose(attn, [0, 2, 1, 3])
    concat = tf.reshape(attn, (batch_size, -1, self.num_heads * self.depth))
    return self.dense(concat)


In [16]:
class EncoderLayer(layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate = 0.01):
    super().__init__()
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = tf.keras.Sequential([
        layers.Dense(dff, activation='relu'),
        layers.Dense(d_model)
    ])
    self.ln1 = layers.LayerNormalization(epsilon=1e-6)
    self.ln2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)

  def call(self, x, training):
    attn = self.mha(x, x, x)
    out1 = self.ln1(x + self.dropout1(attn, training=training))
    ffn_out = self.ffn(out1)
    out2 = self.ln2(out1 + self.dropout2(ffn_out, training=training))
    return out2