In [1]:
with open("/content/The_Verdict.txt", "r", encoding="utf-8") as f:
  text = f.read()

print(text[:100])

﻿I HAD always thought Jack Gisburn rather a cheap genius-- though a good fellow enough--so it was no


In [2]:
import importlib
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
IDs = tokenizer.encode(text)
print(IDs[:20])

[171, 119, 123, 40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 996, 257, 922]


In [3]:
import tensorflow as tf


block_size = 64

def create_tf_dataset(ids, block_size):
    ids = tf.constant(ids, dtype=tf.int32)
    dataset = tf.data.Dataset.from_tensor_slices(ids)
    dataset = dataset.window(block_size + 1, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda x: x.batch(block_size + 1))
    dataset = dataset.map(lambda x: (x[:-1], x[1:]))
    return dataset

dataset = create_tf_dataset(IDs, block_size)

for x, y in dataset:
    print("Input:", x.numpy(),"\n", "Target:", y.numpy())
    break



Input: [  171   119   123    40   367  2885  1464  1807  3619   402   271 10899
  2138   257  7026 15632   438   996   257   922  5891  1576   438   568
   340   373   645  1049  5975   284   502   284  3285   326    11   287
   262  6001   286   465 13476    11   339   550  5710   465 12036    11
  6405   257  5527 27075    11   290  4920  2241   287   257  4489    64
   319   262 34686 41976] 
 Target: [  119   123    40   367  2885  1464  1807  3619   402   271 10899  2138
   257  7026 15632   438   996   257   922  5891  1576   438   568   340
   373   645  1049  5975   284   502   284  3285   326    11   287   262
  6001   286   465 13476    11   339   550  5710   465 12036    11  6405
   257  5527 27075    11   290  4920  2241   287   257  4489    64   319
   262 34686 41976    13]


In [4]:
batch_size = 16

batched_dataset = dataset.batch(batch_size, drop_remainder=True) #Drop last incomplete batch

In [5]:
for input_ids, target_ids in batched_dataset.take(1): #take the first batch
    print(input_ids.shape, target_ids.shape)

(16, 64) (16, 64)


In [6]:
class GPTEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, max_len, d_model):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(vocab_size, d_model) #Lookup table of size: vocab_size × d_model, each token id maps to a vector

        self.pos_emb   = tf.keras.layers.Embedding(max_len, d_model) #lookup table of size: vocab_size x d_model, Each position index is maped to a vector
        #max_len = maximum sequence length the model can handle
        # It means: The largest number of tokens the model can process in one input sequence so max_len >= block_size, gpt2=1024, gpt3 = 2048
        #max_len = 128 then Position IDs: 0 to 127
        # when block size less than max len =max_len = 512, block size = 64 then Only positions 0–63 are used and others are ignored in forward pass



    def call(self, input_ids):#forward pass
        B, T = tf.shape(input_ids)[0], tf.shape(input_ids)[1] #Extract batch size and sequence length
        positions = tf.range(0, T) #Create position indices like [0, 1, 2, ..., T-1]
        positions = tf.expand_dims(positions, 0)  # (1, T) Expand dims like (T,)->(1,T), prepare for batch broadcasting
        positions = tf.tile(positions, [B, 1])    # (B, T) Tile accross batch, resulting shape (B,T)
        ''' [
          [0, 1, 2, 3],
          [0, 1, 2, 3]
        ] '''
        x = self.token_emb(input_ids) #Token embedding lookup, input (B,T), output shape -> (B, T, d_model) Each token ID → vector.
        p = self.pos_emb(positions) #Positional embedding lookup, input(B,T), Output shape -> (B, T, d_model)

        return x + p #(B, T, d_model) + (B, T, d_model)

#Final output (B, T, d_model) means For each token in each sequence, you now have a vector.
# x[b, t] → vector of size d_model, x[0, 0] → embedding of 1st token of 1st sentence , x[1, 4] → embedding of 5th token of 2nd sentence





In [7]:
vocab_size = tokenizer.n_vocab

embedding_layer = GPTEmbedding(
    vocab_size=vocab_size,
    max_len=block_size,
    d_model=256
)

for input_ids, target_ids in batched_dataset:
    x = embedding_layer(input_ids)

In [8]:
x[0,0] #vector of 1st token of 1st sequence

<tf.Tensor: shape=(256,), dtype=float32, numpy=
array([-1.28032565e-02, -1.21125691e-02,  2.53698602e-02,  4.59843501e-02,
       -2.76092142e-02, -2.54418589e-02, -3.05190720e-02,  7.02824742e-02,
       -1.04108229e-02, -3.79961729e-03, -7.57056102e-02,  5.65003380e-02,
        1.31392516e-02,  2.80474089e-02, -1.32741481e-02,  4.27048430e-02,
       -3.10556367e-02, -8.48719925e-02, -2.59093270e-02,  1.29586589e-02,
       -3.66173536e-02,  3.60317677e-02, -6.43103719e-02,  5.68809062e-02,
       -3.15227881e-02, -4.10348661e-02, -4.31039706e-02, -1.39475577e-02,
       -7.42787123e-02,  3.87852564e-02, -4.99856584e-02, -3.37286368e-02,
        1.33453589e-02, -2.87465677e-02,  5.08940108e-02, -8.77521634e-02,
        8.81641917e-03,  6.71612248e-02,  5.09570315e-02, -4.11628038e-02,
       -6.22373596e-02,  6.48895279e-02, -5.62416911e-02, -7.79256001e-02,
       -7.87595659e-03,  1.68758295e-02, -7.69135058e-02, -3.32330316e-02,
       -7.71699920e-02, -7.83336386e-02, -1.68330446

In [11]:
x[5,10] #  vector for 11th token of 6th sequence(0 base indexing)

# Each row contains 64 vectors because each row in the a contains 64 IDs.
#Thats why x[5,64] will show out of index

<tf.Tensor: shape=(256,), dtype=float32, numpy=
array([ 1.24114975e-02,  1.88071169e-02, -8.43556970e-03, -7.48472959e-02,
        2.42328513e-02, -4.46803719e-02,  1.80043206e-02, -4.18944508e-02,
        1.54528841e-02, -1.68267265e-02,  1.45807602e-02, -7.77342767e-02,
        8.11850652e-02, -7.71826133e-03,  7.91367739e-02, -5.75868860e-02,
        4.07451652e-02,  3.10507305e-02, -1.96940154e-02,  8.92629474e-03,
       -4.39274460e-02,  1.82956830e-03,  2.92868372e-02, -7.66075402e-02,
        5.37450612e-03,  6.52904660e-02, -4.50483896e-02, -7.82492012e-03,
       -2.03295909e-02, -1.48014426e-02,  6.52494058e-02,  7.79309273e-02,
       -8.85753408e-02, -8.69822502e-03, -4.14783582e-02, -6.09673187e-03,
        3.86983790e-02,  1.64917819e-02, -1.45010203e-02, -5.29844314e-04,
       -2.76053417e-02,  8.17303732e-03, -7.65282512e-02,  4.38488834e-03,
        6.76405206e-02,  8.90879780e-02, -5.66544421e-02,  4.81134653e-03,
        1.89237110e-02, -4.44311835e-02,  4.04871106

#What is LookUp Table
A lookup table is just a matrix. Nothing more.

For embeddings, embedding matrix = a 2D table of numbers = vocab_size x  d_model,

Embedding(vocab_size, d_model)

Each row = vector for one token ID


For positional encoding, Lookup table = max_len x d_model

Each row = vector for one position.

Embedding(max_len, d_model)

# How lookup actually works (core idea)
Embedding lookup = row indexing

No multiplication.
No dot product.
Just select rows.

#Numeric  Example
vocab_size = 5, d_model = 3

token embedding matrix:\
token_emb_table = [
  
          [ 0.10,  0.20,  0.30 ],   # token 0
          [ 0.40,  0.50,  0.60 ],   # token 1
          [ 0.70,  0.80,  0.90 ],   # token 2
          [ 1.00,  1.10,  1.20 ],   # token 3
          [ 1.30,  1.40,  1.50 ]    # token 4
]

input_ids = [2, 4, 1]

Lookup Operations, token_emb(input_ids)

Result = [

        [0.70, 0.80, 0.90],   # row 2
        [1.30, 1.40, 1.50],   # row 4
        [0.40, 0.50, 0.60]    # row 1
]

That’s it.
No computation — just indexing.

Positional embedding lookup

max_len = 6, d_model = 3

positional embedding table,
pos_emb_table = [

    [0.01, 0.02, 0.03],   # position 0
    [0.04, 0.05, 0.06],   # position 1
    [0.07, 0.08, 0.09],   # position 2
    [0.10, 0.11, 0.12],   # position 3
    [0.13, 0.14, 0.15],   # position 4
    [0.16, 0.17, 0.18]    # position 5
]

positions = [0, 1, 2]

Lookup, [

    [0.01, 0.02, 0.03],   # pos 0
    [0.04, 0.05, 0.06],   # pos 1
    [0.07, 0.08, 0.09]    # pos 2
]

#Combining token + position (final input)
token vectors = [

     [0.70, 0.80, 0.90]
     [1.30, 1.40, 1.50]
     [0.40, 0.50, 0.60]

]

position vectors = [

    [0.01, 0.02, 0.03]
    [0.04, 0.05, 0.06]
    [0.07, 0.08, 0.09]

]

Adding element wise: [

    [0.71, 0.82, 0.93],
    [1.34, 1.45, 1.56],
    [0.47, 0.58, 0.69]

]