<a href="https://colab.research.google.com/github/QaziSaim/Transformer-Text-Generation/blob/main/Transformer_Designing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from datasets import load_dataset
import numpy as np

### Loading Dataset

In [None]:
# Fetch the raw WikiText-2 corpus and flatten its training split into a single
# space-separated string.
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
train_lines = dataset['train']['text']
text = " ".join(train_lines)
print('Total characters:', len(text))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Total characters: 10929707


### Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [None]:
# Fit a word-level tokenizer on the training lines. `num_words` caps the ids
# that `texts_to_sequences` emits: every word ranked outside the cap is mapped
# to the `<oov>` id, so all emitted ids are strictly below `num_words`.
tokenizer = Tokenizer(num_words=10000, oov_token='<oov>')
tokenizer.fit_on_texts(dataset['train']['text'])
# `word_index` still lists every distinct word seen (far more than the cap), so
# sizing the model from it would allocate embedding rows and output logits for
# ids that can never occur. Size the vocabulary by the ids actually emitted.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
print('Vocab Size ',vocab_size)
# Encode the whole corpus as one long stream of token ids.
tokens = tokenizer.texts_to_sequences([text])[0]

Vocab Size  66008


In [None]:
seq_length = 10

# Slide a window over the token stream: each run of `seq_length` consecutive
# ids is one input, and the id immediately after the window is its target.
target_positions = range(seq_length, len(tokens))
input_sequences = np.array(
    [tokens[pos - seq_length:pos] for pos in target_positions]
)
output_words = np.array([tokens[pos] for pos in target_positions])
print("Training samples:", input_sequences.shape)


Training samples: (1760497, 10)


In [None]:
# Show (number of samples, seq_length) for the windowed inputs.
input_sequences.shape

(1760497, 10)

In [None]:
import tensorflow as tf

In [None]:
def scaled_dot_product_attention(q,k,v):
  """Compute softmax(q @ k^T / sqrt(d_k)) @ v.

  Args:
    q: Query tensor.
    k: Key tensor; its last two axes are transposed before the product, and
      the size of its last axis is used as d_k.
    v: Value tensor, combined using the attention weights.

  Returns:
    The attention-weighted combination of `v`.
  """
  # Similarity of every query with every key (q @ k^T over the last two axes).
  logits = tf.matmul(q, k, transpose_b=True)
  # Cast d_k to the logits' own dtype instead of a fixed float32: TensorFlow
  # does not auto-promote, so a float32 divisor would raise for float16,
  # bfloat16 or float64 inputs (e.g. under a mixed-precision policy).
  dk = tf.cast(tf.shape(k)[-1], logits.dtype)
  scaled_logits = logits / tf.math.sqrt(dk)
  # Normalise over the key axis so each query's weights sum to 1.
  weights = tf.nn.softmax(scaled_logits, axis=-1)
  return tf.matmul(weights, v)

In [None]:
class MultiHeadAttention(layers.Layer):
  """Multi-head attention: project q/k/v, attend per head, merge, project."""

  def __init__(self, d_model, num_heads):
    """Create the four projection layers.

    Args:
      d_model: Width of every projection; must be divisible by `num_heads`.
      num_heads: Number of attention heads.
    """
    super().__init__()
    assert d_model % num_heads ==0
    self.num_heads = num_heads
    # Per-head width, so that num_heads * depth == d_model.
    self.depth = d_model // num_heads

    self.wq = layers.Dense(d_model)
    self.wk = layers.Dense(d_model)
    self.wv = layers.Dense(d_model)
    self.dense = layers.Dense(d_model)

  def split_heads(self, x ,batch_size):
    """Reshape to (batch_size, -1, num_heads, depth), then swap axes 1 and 2."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, q, k, v):
    """Run attention over `q`, `k`, `v` and return the merged, projected result."""
    batch_size = tf.shape(q)[0]

    # Linear projections first, then carve each one into heads.
    q_proj, k_proj, v_proj = self.wq(q), self.wk(k), self.wv(v)
    q_heads = self.split_heads(q_proj, batch_size)
    k_heads = self.split_heads(k_proj, batch_size)
    v_heads = self.split_heads(v_proj, batch_size)

    context = scaled_dot_product_attention(q_heads, k_heads, v_heads)

    # Undo the head split: swap axes 1 and 2 back, then fold the heads into
    # a single last axis of width num_heads * depth.
    context = tf.transpose(context, perm=[0, 2, 1, 3])
    merged = tf.reshape(context, (batch_size, -1, self.num_heads * self.depth))
    return self.dense(merged)


In [None]:
class EncoderLayer(layers.Layer):
  """One encoder block: self-attention, then a two-layer feed-forward network.

  Each sub-block's output goes through dropout, is added back to its input
  (residual connection) and is layer-normalised.
  """

  def __init__(self, d_model, num_heads, dff, rate = 0.01):
    """Create the sub-layers.

    Args:
      d_model: Model width (attention width and feed-forward output width).
      num_heads: Number of attention heads.
      dff: Width of the hidden (ReLU) feed-forward layer.
      rate: Dropout rate for both dropout layers.
        NOTE(review): this default (0.01) differs from `Transformer`'s 0.1;
        kept as-is for compatibility -- confirm which value is intended.
    """
    super().__init__()
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = tf.keras.Sequential([
        layers.Dense(dff, activation='relu'),
        layers.Dense(d_model)
    ])
    self.ln1 = layers.LayerNormalization(epsilon=1e-6)
    self.ln2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)

  def call(self, x, training=False):
    """Apply the block to `x`.

    Args:
      x: Input tensor, used as query, key and value of the attention.
      training: Whether dropout is active. Defaults to False (inference) so
        the layer can be called without the flag, matching `Transformer.call`.

    Returns:
      The output of the second layer normalisation.
    """
    # Self-attention: the same tensor is query, key and value.
    attn = self.mha(x, x, x)
    out1 = self.ln1(x + self.dropout1(attn, training=training))
    ffn_out = self.ffn(out1)
    out2 = self.ln2(out1 + self.dropout2(ffn_out, training=training))
    return out2

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-only next-token model: embedding -> encoder stack -> vocab logits.

  The output layer has no activation, so `call` returns unnormalised scores
  (logits); pair the model with a loss configured for logits.
  """

  def __init__(self, vocab_size, num_layers, d_model, num_heads, dff, rate=0.1):
    """Create the layers.

    Args:
      vocab_size: Number of token ids (embedding rows and output logits).
      num_layers: Number of stacked `EncoderLayer`s.
      d_model: Embedding / model width.
      num_heads: Attention heads per encoder layer.
      dff: Hidden width of each encoder layer's feed-forward network.
      rate: Dropout rate for the embedding dropout and every encoder layer.
    """
    super().__init__()
    self.embedding = layers.Embedding(vocab_size,d_model)
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
    self.dropout = layers.Dropout(rate=rate)
    self.final = layers.Dense(vocab_size)

  def call(self, x, training=False):
    """Return logits over the vocabulary for the token following each sequence.

    Args:
      x: Integer token ids; indexed below as (batch, position).
      training: Whether dropout is active.

    Returns:
      Tensor of shape (batch, vocab_size) computed from the last position.
    """
    x = self.embedding(x)
    # Scale embeddings by sqrt(d_model); cast to x's dtype so the multiply
    # is valid whatever float type the embedding produced.
    x *= tf.math.sqrt(tf.cast(self.embedding.output_dim, x.dtype))
    # NOTE(review): no positional information is added to the embeddings, so
    # the encoder stack cannot tell token order apart -- consider adding a
    # positional encoding here.
    x = self.dropout(x, training=training)
    for layer in self.enc_layers:
      x = layer(x, training=training)
    # Only the last position is returned. Dense acts on each position
    # independently, so slicing first gives the same result while avoiding
    # the vocab-sized projection for every other position.
    return self.final(x[:, -1, :])

In [None]:
# Instantiate the next-word model: 3 encoder layers, width 256, 8 heads,
# feed-forward hidden width 1024.
model = Transformer(
    vocab_size,
    num_layers=3,
    d_model=256,
    num_heads=8,
    dff=1024,
)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# The model's output Dense layer has no activation, so it returns raw logits.
# The string alias 'sparse_categorical_crossentropy' builds the loss with
# from_logits=False (i.e. it expects probabilities), which silently computes a
# wrong loss on logits. Construct the loss explicitly with from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
# Print the model's layer/parameter summary.
# NOTE(review): the model has not been called on any input at this point in
# the notebook, so Keras may report it as unbuilt -- confirm.
model.summary()

In [None]:
# Conventional feature/target aliases for the windowed inputs and next-word ids.
X, y = input_sequences, output_words

In [None]:
# Sanity check: X and y must have the same number of rows.
X.shape,y.shape

((1760497, 10), (1760497,))

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training when validation loss has not improved for 5 epochs and roll
# the model back to the weights from its best epoch.
es = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

In [None]:
from sklearn.model_selection import train_test_split

# Consecutive samples are sliding windows that share seq_length - 1 tokens, so
# a shuffled split scatters near-duplicate contexts across train/val/test and
# leaks training text into the held-out sets (inflating validation/test scores
# and misleading early stopping). Split contiguously instead: the first 60% of
# the stream trains, the next 20% validates, the final 20% tests. Split sizes
# are the same as with a shuffled split.
X_main,X_test,y_main,y_test = train_test_split(X,y,test_size=0.2,shuffle=False)
X_train,X_val,y_train,y_val = train_test_split(X_main,y_main,test_size=0.25,shuffle=False)

In [None]:
# Report (features, targets) shapes for the train, test and validation splits,
# in that order.
print('shapes')
for features, targets in ((X_train, y_train), (X_test, y_test), (X_val, y_val)):
    print(features.shape, targets.shape)

shapes
(1056297, 10) (1056297,)
(352100, 10) (352100,)
(352100, 10) (352100,)


In [None]:
# Train for up to 50 epochs, validating on the held-out split each epoch;
# the early-stopping callback may end training sooner.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=50,
    validation_data=(X_val, y_val),
    callbacks=[es],
)

Epoch 1/50


ResourceExhaustedError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start

  File "/usr/local/lib/python3.12/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.12/asyncio/base_events.py", line 645, in run_forever

  File "/usr/lib/python3.12/asyncio/base_events.py", line 1999, in _run_once

  File "/usr/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 499, in process_one

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute

  File "/usr/local/lib/python3.12/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/tmp/ipython-input-1386737319.py", line 2, in <cell line: 0>

  File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 377, in fit

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 220, in function

  File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 133, in multi_step_on_iterator

Out of memory while trying to allocate 4342677504 bytes.
	 [[{{node StatefulPartitionedCall}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_multi_step_on_iterator_13045]

In [None]:
# Print the layer/parameter summary again, now that the fit cell above has run.
model.summary()

In [None]:
# Score the model on the test split; with the compiled metrics, evaluate()
# returns the loss followed by the accuracy.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f'Test Accuracy {accuracy:.4f}')

[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.0025 - loss: 6.8170
Test Accuracy 0.0028


In [None]:
# Test-set loss returned by model.evaluate in the previous cell.
print(loss)

6.813540935516357


In [None]:
def generate_text(seed_text, next_words=30):
    """Greedily extend `seed_text` one predicted word at a time.

    Uses the notebook-level `tokenizer`, `model` and `seq_length`.

    Args:
        seed_text: Starting text; it is re-tokenized on every step.
        next_words: Number of words to append.

    Returns:
        `seed_text` followed by the generated words, space-separated.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Left-pad (or truncate) the context to exactly `seq_length` ids.
        token_list = pad_sequences([token_list], maxlen=seq_length, padding='pre')
        # Greedy decoding: take the highest-scoring token id.
        predicted = int(np.argmax(model.predict(token_list, verbose=0)))

        # `index_word` is the tokenizer's id -> word mapping, so this is a
        # constant-time lookup instead of a scan over the whole `word_index`.
        # Ids with no word (e.g. 0) yield "".
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text

# Demo: extend a short prompt by the default 30 words.
print(generate_text("machine learning is"))


machine learning is g g g g g g g g g g g g g g g g g g g g g g g g g g g g g g
