In [1]:
!pip install tensorflow



In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences,to_categorical
from tensorflow.keras.layers import Embedding,Dropout,Layer,LayerNormalization,Dense
import numpy as np


In [3]:
def load_data(file_path):
  """Read the UTF-8 text file at `file_path` and return its full contents as one string."""
  with open(file_path, mode='r', encoding='utf-8') as handle:
    return handle.read()
file_path='hp1.txt'
text= load_data(file_path).lower() # the raw training corpus, lower-cased, as one string

# Tokenize the text: build the word -> integer-id vocabulary from the corpus
# NOTE(review): oov_token='' registers the empty string as the out-of-vocabulary
# word; this may be a markup-stripped '<OOV>' -- confirm the intended token.
tokenizer= Tokenizer(oov_token='')
tokenizer.fit_on_texts([text])
# +1 because word ids start at 1; id 0 is left free for padding
total_words=len(tokenizer.word_index)+1

# Convert text to sequences: one flat list of integer token ids for the whole
# corpus (these are ids, not embeddings -- embeddings are learned inside the model)
tokens=tokenizer.texts_to_sequences([text])[0]
seq_len= 50

# Fail early with a clear message instead of producing an empty training set
if len(tokens) <= seq_len:
  raise ValueError(
      f'Corpus has only {len(tokens)} tokens; more than seq_len={seq_len} '
      'are needed to build at least one training window.')

# Sliding windows over the corpus, one per position:
# First seq_len tokens (input): Used for training the model.
# Last token (target): Used as the label the model tries to predict.
# so total of (50 + 1) in one input_sequence index
input_sequences=[tokens[i-seq_len:i+1] for i in range(seq_len,len(tokens))]

# Every window already holds exactly seq_len+1 tokens, so no padding is actually
# added here; pad_sequences just turns the list of windows into a 2-D int32 array.
# After the split X holds the inputs and y the label for each input.
input_sequences = pad_sequences(input_sequences,maxlen=seq_len+1,padding='pre')
X,y= input_sequences[:,:-1],input_sequences[:,-1]

# Encoding
# One-hot encode the labels (y) so they match the softmax output of the model.
# Note there are other ways of encoding, like pre-trained word2vec vectors and so on.
# NOTE: this dense (num_windows x total_words) matrix is memory-hungry; integer
# labels with a sparse categorical loss would avoid it, but that requires
# changing the loss used when the model is compiled.
y=to_categorical(y, num_classes=total_words)

In [4]:
total_words

6663

In [5]:
tokens # this is a flat (1-D) Python list of integer token ids

[2162,
 3680,
 4,
 274,
 224,
 8,
 651,
 332,
 652,
 535,
 35,
 1268,
 5,
 164,
 20,
 21,
 35,
 1586,
 973,
 1587,
 14,
 69,
 157,
 21,
 35,
 2,
 141,
 128,
 653,
 789,
 5,
 32,
 1588,
 12,
 169,
 490,
 110,
 1416,
 142,
 21,
 68,
 55,
 909,
 25,
 505,
 1788,
 151,
 224,
 10,
 2,
 2701,
 8,
 6,
 2702,
 275,
 2703,
 140,
 183,
 1417,
 7,
 10,
 6,
 394,
 3681,
 333,
 25,
 491,
 191,
 593,
 974,
 7,
 131,
 36,
 6,
 69,
 233,
 1418,
 274,
 224,
 10,
 975,
 4,
 2704,
 4,
 17,
 343,
 689,
 2,
 594,
 3682,
 8,
 593,
 140,
 159,
 12,
 69,
 1789,
 22,
 46,
 910,
 53,
 157,
 8,
 60,
 89,
 2705,
 63,
 1419,
 3683,
 2163,
 18,
 2,
 2706,
 2,
 287,
 17,
 6,
 356,
 1090,
 275,
 93,
 4,
 12,
 49,
 1589,
 38,
 10,
 73,
 2707,
 146,
 1269,
 2,
 287,
 17,
 383,
 21,
 276,
 26,
 21,
 690,
 17,
 6,
 691,
 4,
 49,
 1590,
 976,
 10,
 20,
 2164,
 107,
 3684,
 11,
 21,
 55,
 87,
 21,
 57,
 1591,
 11,
 45,
 327,
 179,
 29,
 47,
 2,
 911,
 274,
 117,
 10,
 274,
 2708,
 731,
 26,
 21,
 221,
 564,
 31,
 790,
 234

In [6]:
len(tokens)

81022

In [7]:
input_sequences.shape

(80972, 51)

In [8]:
input_sequences

array([[2162, 3680,    4, ...,   10,    2, 2701],
       [3680,    4,  274, ...,    2, 2701,    8],
       [   4,  274,  224, ..., 2701,    8,    6],
       ...,
       [3276,  186,   88, ..., 1219,   25,   93],
       [ 186,   88,  112, ...,   25,   93,   43],
       [  88,  112, 1281, ...,   93,   43, 1204]], dtype=int32)

In [9]:
y.shape

(80972, 6663)

## Core of the Transformer model

In [10]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout

class MultiHeadAttention(Layer):
  """Multi-head scaled dot-product attention.

  The inputs are projected to queries (Q), keys (K) and values (V) with one
  Dense(embed_dim) layer each; the projections are then reshaped so that every
  head works on its own `embed_dim // num_heads` sized slice.
  A single large matrix multiplication followed by a reshape is more efficient
  than many small per-head multiplications: GPUs are optimized for large,
  parallel matrix products, and it lets TF/Keras batch the computation.

  Args:
    embed_dim: Dimension of Q, K and V before splitting into heads; the same
      as the dimension of the input (word) embeddings. Example - 512.
    num_heads: Number of attention heads. Example - 8. Must divide `embed_dim`.
    **kwargs: Forwarded to the Keras `Layer` constructor (e.g. `name`).

  Raises:
    ValueError: If `num_heads` is not positive or does not divide `embed_dim`.
  """
  def __init__(self,embed_dim,num_heads,**kwargs):
    # Validate first: with a non-divisible embed_dim the integer division below
    # would silently drop dimensions and the reshape in split_heads would fail
    # much later with a hard-to-read shape error.
    if num_heads <= 0 or embed_dim % num_heads != 0:
      raise ValueError(
          f'embed_dim ({embed_dim}) must be divisible by a positive '
          f'num_heads ({num_heads}).')
    super(MultiHeadAttention,self).__init__(**kwargs) # This line calls the constructor of the parent class (Layer)
    self.num_heads=num_heads # example - 8

    self.embed_dim= embed_dim # example - 512
    # Size of each attention head's subspace: each head gets a smaller slice
    # of the embedding dimension. Example - 512 // 8 = 64
    self.projection_dim= embed_dim // num_heads

    # Fully connected (dense) layers that project the input into Q, K, V.
    # They keep the size at embed_dim; the result is split across heads later.
    self.query_dense = Dense(embed_dim) # Q determines "what to focus on/what to search"
    self.key_dense = Dense(embed_dim) # K acts as "labels" to be matched with queries
    self.value_dense = Dense(embed_dim) # V holds the actual information

    # After attention is applied, the outputs of all heads are concatenated
    # back into embed_dim and mixed by this final projection
    self.combine_heads = Dense(embed_dim)

  def attention(self, query, key, value):
    """Scaled dot-product attention.

    Returns:
      A tuple `(output, attention_probs)`: the attention-weighted sum of
      `value`, and the attention weights themselves.
    """
    scores= tf.matmul(query,key, transpose_b=True)
    # Scale by sqrt(head size); the integer is cast to a float32 tensor first
    scores/=tf.math.sqrt(tf.cast(self.projection_dim, tf.float32))
    # Softmax is applied along the keys (the last dimension of the scores):
    # each row corresponds to one query token attending to all key tokens,
    # so each row sums to 1. The higher the score, the more focus that
    # key token gets.
    attention_probs= tf.nn.softmax(scores, axis=-1)

    return tf.matmul(attention_probs, value), attention_probs

  def split_heads(self,x, batch_size):
    """Split the last dimension of `x` into (num_heads, projection_dim).

    x - query, key or value with shape (batch_size, seq_len, embed_dim)
    batch_size - number of sequences being processed together in parallel

    Returns a tensor of shape (batch_size, num_heads, seq_len, projection_dim).
    """
    # The -1 tells TensorFlow to infer that dimension from the total number of
    # elements in the tensor; here it is replaced by seq_len.
    x= tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
    # before transpose - (batch_size, seq_len, num_heads, projection_dim)
    # after transpose - (batch_size, num_heads, seq_len, projection_dim)
    return tf.transpose(x, perm= [0,2,1,3])

  # In TF/Keras, call(self, inputs) is the standard method of Layer subclasses
  # that defines the forward pass of the layer
  def call(self, inputs):
    """Forward pass. `inputs` is a sequence of three tensors: (query, key, value)."""
    query,key,value= inputs
    batch_size = tf.shape(query)[0] # query is (batch_size, seq_len, embed_dim)

    query = self.split_heads(self.query_dense(query), batch_size)
    key = self.split_heads(self.key_dense(key), batch_size)
    value = self.split_heads(self.value_dense(value), batch_size)
    attention,_ = self.attention(query,key,value)
    # before transpose - (batch_size, num_heads, seq_len, projection_dim)
    # after transpose - (batch_size, seq_len, num_heads, projection_dim)
    attention = tf.transpose(attention, perm= [0,2,1,3])

    # Merge all heads back into a single vector
    # (batch_size, seq_len, num_heads, projection_dim) -> (batch_size, seq_len, embed_dim)
    concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
    return self.combine_heads(concat_attention)

class TransformerBlock(Layer):
  """One Transformer encoder block: self-attention followed by a feed-forward net.

  Each of the two sub-layers is followed by dropout, a residual connection
  and layer normalization.

  Args:
    embed_dim: Size of the token embeddings flowing through the block.
    num_heads: Number of attention heads.
    ff_dim: Hidden size of the feed-forward network.
    rate: Dropout rate used after both sub-layers.
    **kwargs: Forwarded to the Keras `Layer` constructor (e.g. `name`).
  """
  def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
    super(TransformerBlock,self).__init__(**kwargs)
    self.att = MultiHeadAttention(embed_dim, num_heads)
    self.ffn = tf.keras.models.Sequential([
        Dense(ff_dim, activation= 'relu'),
        Dense(embed_dim)
    ])
    # y = (x-mean)/root(variance + epsilon)
    # epsilon ensures that we never divide by zero
    # it is small enough to not affect the result but large enough to prevent instability
    self.layernorm1 = LayerNormalization(epsilon=1e-6)
    self.layernorm2 = LayerNormalization(epsilon=1e-6)
    # Two sub-layers (attention and feed-forward), so two dropouts and two layernorms
    self.dropout1 = Dropout(rate)
    self.dropout2 = Dropout(rate)

  def call(self, inputs, training=None):
    """Forward pass.

    Args:
      inputs: Tensor of token embeddings; used as query, key and value.
      training: Whether dropout is active. With the default None the caller
        does not have to choose a mode and Keras supplies it (on during
        fit, off during predict/evaluate).
    """
    # Self-attention: query, key and value all come from the same sequence
    attn_output = self.att([inputs , inputs, inputs])

    # Dropout randomly deactivates some neurons during training to reduce overfitting
    # Ensure that dropout is only applied during training and not inference
    attn_output = self.dropout1(attn_output, training= training)
    out1 = self.layernorm1(inputs + attn_output) # Residual Connection
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training= training)
    return self.layernorm2(out1 + ffn_output) # Residual Connection

class TokenAndPositionEmbedding(Layer):
  """Adds a learned position embedding to a learned token embedding.

  Args:
    maxlen: Number of distinct positions the position table has rows for.
    vocab_size: Number of distinct token ids the token table has rows for.
    embed_dim: Size of every embedding vector.

  An Embedding layer takes an integer tensor and replaces each integer with a
  trainable vector of size embed_dim.

  Position example - positions [0,1,2,3] for "Keerti Teaches at Educosys";
  after embedding (embed_dim = 8):
    [0.2, 0.1, 0.3, 0.5, 0.6, 0.9, 0.7, 0.8]   # Position 0
    [0.4, 0.2, 0.1, 0.6, 0.5, 0.7, 0.9, 0.3]   # Position 1
    [0.5, 0.3, 0.8, 0.2, 0.7, 0.4, 0.6, 0.1]   # Position 2
    [0.9, 0.6, 0.2, 0.3, 0.1, 0.8, 0.4, 0.7]   # Position 3

  Token example - x starts as (batch_size, seq_len): batch_size sentences of
  seq_len integer token ids, each between 0 and vocab_size - 1. After the
  embedding it is (batch_size, seq_len, embed_dim). With embed_dim = 8 and
  batch_size = 2:
    First sentence
      [0.2, 0.1, 0.4, 0.3, 0.8, 0.7, 0.6, 0.9]   # Token 2
      [0.5, 0.3, 0.9, 0.1, 0.2, 0.6, 0.8, 0.7]   # Token 5
      [0.4, 0.9, 0.2, 0.3, 0.1, 0.7, 0.5, 0.6]   # Token 1
      [0.3, 0.8, 0.6, 0.2, 0.5, 0.9, 0.7, 0.4]   # Token 7
    Second sentence
      [0.1, 0.6, 0.9, 0.7, 0.3, 0.5, 0.2, 0.8]   # Token 0
      [0.4, 0.2, 0.3, 0.9, 0.7, 0.5, 0.1, 0.6]   # Token 3
      [0.8, 0.5, 0.4, 0.1, 0.6, 0.3, 0.2, 0.7]   # Token 8
      [0.9, 0.3, 0.5, 0.7, 0.8, 0.2, 0.6, 0.1]   # Token 4
  """
  def  __init__(self, maxlen, vocab_size, embed_dim):
    super().__init__()
    # One lookup table for token ids, one for position indices
    self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
    self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

  def call(self, x):
    """Return token embeddings of `x` plus the embeddings of their positions."""
    # Length of the incoming sequences, read from the last axis of x
    seq_len = tf.shape(x)[-1]
    # Position indices [0, 1, 2, ..., seq_len-1]
    position_ids = tf.range(seq_len)
    # (seq_len,) -> (seq_len, embed_dim): one trainable vector per position
    position_vectors = self.pos_emb(position_ids)
    # (batch_size, seq_len) -> (batch_size, seq_len, embed_dim)
    token_vectors = self.token_emb(x)
    # TensorFlow broadcasts the position vectors across the batch, treating
    # them as (1, seq_len, embed_dim), which allows the element-wise addition.
    return token_vectors + position_vectors


The line `super(MultiHeadAttention, self).__init__()` is used to call the `__init__` method (the constructor) of the parent class of `MultiHeadAttention`. This is crucial for properly initializing the inherited features and ensuring that all necessary setup from the parent class (`tf.keras.layers.Layer` in this case) is performed before the subclass's own initialization logic runs. It ensures that the `MultiHeadAttention` class behaves correctly as a Keras Layer.

The line `attn_output = self.att([inputs, inputs, inputs])` is calling the `MultiHeadAttention` layer (`self.att`) within the `TransformerBlock`.

In a Transformer's self-attention mechanism, the Query, Key, and Value matrices are all derived from the same input sequence. Therefore, `inputs` is passed three times to the `self.att` layer, serving as:

*   **Query**: What the attention mechanism is looking for.
*   **Key**: What the attention mechanism is comparing against.
*   **Value**: The actual information to be extracted and combined.

The `attn_output` variable will then hold the result of this self-attention calculation, which is a weighted sum of the Value vectors, emphasizing the most relevant parts of the input sequence based on their relationships.

## Model the whole architecture , compile and run the training

In [11]:
# Model Parameters
embed_dim =128 # Embedding size
num_heads = 4  # Number of attention heads
ff_dim = 512   # Feed-forward layer size
maxlen= seq_len # context length in tokens; here it is 50, defined above

# total_words is 6663 here (see above): the size of the tokenizer's word index
# plus one, because id 0 is left free for padding

# Build the model
inputs = tf.keras.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, total_words, embed_dim)
x = embedding_layer(inputs)
print(x.shape)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# Do not hard-code training=True here: a literal True is recorded in the graph
# and asks for dropout on every call, including inference. With None, Keras
# decides the mode itself (on during fit, off during predict/evaluate).
x = transformer_block(x, training=None)
print(x.shape)
# Keep only the representation of the last position; it is used to predict the next word
x =x[:,-1,:]
print(x.shape)
# One probability per vocabulary entry
x= Dense(total_words, activation='softmax')(x)
print(x.shape)
model= tf.keras.Model(inputs=inputs, outputs=x)

# Compile the model
# categorical_crossentropy matches the one-hot encoded labels in y
model.compile(optimizer='adam', loss = 'categorical_crossentropy',metrics=['accuracy'])

model.summary()

(None, 50, 128)
(None, 50, 128)
(None, 128)
(None, 6663)


In [12]:
history = model.fit(X,y, batch_size=32, epochs=10)

Epoch 1/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 6ms/step - accuracy: 0.0812 - loss: 6.5217
Epoch 2/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.1611 - loss: 5.0805
Epoch 3/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.2092 - loss: 4.2955
Epoch 4/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.2527 - loss: 3.6654
Epoch 5/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.3139 - loss: 3.1324
Epoch 6/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.3914 - loss: 2.6268
Epoch 7/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.4684 - loss: 2.2013
Epoch 8/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.5447 - loss: 1.8386
Epoch 9/10
[1m2

In [19]:
def generate_text(seed_text, next_words, max_sequence_len):
  """Extend `seed_text` by `next_words` words using greedy decoding.

  Uses the notebook-level `tokenizer` and `model`. On every step the text
  generated so far is tokenized, cut/padded on the left to the model's context
  length, and the most probable next word is appended.

  Args:
    seed_text: Text to start from.
    next_words: Number of words to generate.
    max_sequence_len: Training window length including the target token; the
      model context is `max_sequence_len - 1` tokens.

  Returns:
    `seed_text` followed by the generated words, separated by single spaces.
  """
  context_len = max_sequence_len - 1
  for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen = context_len, padding='pre')
    # One row per input sequence; a single sequence is fed in, so take row 0
    probs = model.predict(token_list,verbose=0)[0]
    # Id 0 is the padding id and has no entry in tokenizer.index_word, so
    # picking it would raise KeyError; choose the best real word id instead.
    next_id = int(np.argmax(probs[1:])) + 1
    seed_text+= ' '+ tokenizer.index_word[next_id]
  return seed_text

# Generate text
seed_text = 'harry looked at'
generated_text = generate_text(seed_text, next_words=500, max_sequence_len= seq_len+1)
print(len(generated_text))
print(generated_text)

2556
harry looked at the dursleys and climbed out into the classroom it was empty except for a large classroom that afternoon harry and ron “well done it ” harry whispered “i think it’s all worked for me nobody stood up at the moment he couldn’t see himself exactly as the other sort of secret i think the ministry of magic has waited norbert ” said ron eating their way into the house for ages harry showed them how to get past fluffy…never…but… harry suddenly jumped to his feet “where’re you going ” ron said ron sleepily “i’ve heard of harry ” said ron “snape’s a team ” said hermione “i mean i’ve been to talk to the impressed at how ron and hermione had started ron and hermione had started to vibrate if she had always been touched the time — and harry had been more nervous about having a time to find the potters were involved there and i got yeh didn’t get him much good day before i think they’d just had a few seconds before he could tell him the potters mrs dursley realized that the pot

What Is Missing Compared to ChatGPT?

- Masked Attention:

ChatGPT uses causal masking so that a word cannot see future words during training. Our model uses regular attention, which allows it to see the entire sequence.

- Multiple Stacked Transformer Blocks:

ChatGPT has many layers (e.g., 12, 24, 96 layers). Our model has only one Transformer block.

- Tokenization & Byte-Pair Encoding (BPE):

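ChatGPT does not split text into whole words; it uses subword tokenization (byte-level Byte-Pair Encoding; WordPiece is a similar scheme used by other models such as BERT), which handles rare and unseen words much better. Our model uses basic word-level tokenization.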

- Training on Large Datasets:

ChatGPT is trained on hundreds of GBs of text. Our model is trained on a single Harry Potter book (very limited).

- Decoding Strategies for Text Generation:

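ChatGPT uses sampling (top-k, nucleus sampling) or beam search to generate text. Our model uses only the simplest decoding strategy, greedy decoding: it always picks the single most probable next word (`argmax`), which makes the output deterministic and prone to repetition.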