## Extracting dataset

In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-11-28 19:33:15--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-11-28 19:33:16 (22.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(chars)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## Tokenizer

In [4]:
encoder = {ch:i for i,ch in enumerate(chars)}
decoder = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [encoder[c] for c in s]
decode = lambda l: ''.join([decoder[i] for i in l])

decode(encode("hey"))

'hey'

In [5]:
import tensorflow as tf
import numpy as np

data = tf.convert_to_tensor(encode(text), dtype=tf.int32)
print(data[:100])

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int32)


## Train/Val split

In [6]:
train_size = int(len(data) * 0.9)
train_data = data[:train_size]
val_data = data[train_size:]

## CBOW

In [7]:
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = tf.random.uniform((batch_size,), minval=0, maxval=len(data) - block_size, dtype=tf.int32)
    x = tf.stack([data[i:i+block_size] for i in ix])
    y = tf.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print("Shape XB")
print(xb.shape)
print("Shape YB")
print(yb.shape)
print('inputs:')
print(xb.numpy())
print('targets:')
print(yb.numpy())

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.numpy()} the target: {target}")

Shape XB
(4, 8)
Shape YB
(4, 8)
inputs:
[[52 49  1 40 63  1 53 59]
 [ 1 45 56 53 61 52  1 40]
 [59 56  0 45 56 43 39 58]
 [47 52 47 59 57  1 58 53]]
targets:
[[49  1 40 63  1 53 59 56]
 [45 56 53 61 52  1 40 53]
 [56  0 45 56 43 39 58 52]
 [52 47 59 57  1 58 53  1]]
when input is [52] the target: 49
when input is [52 49] the target: 1
when input is [52 49  1] the target: 40
when input is [52 49  1 40] the target: 63
when input is [52 49  1 40 63] the target: 1
when input is [52 49  1 40 63  1] the target: 53
when input is [52 49  1 40 63  1 53] the target: 59
when input is [52 49  1 40 63  1 53 59] the target: 56
when input is [1] the target: 45
when input is [ 1 45] the target: 56
when input is [ 1 45 56] the target: 53
when input is [ 1 45 56 53] the target: 61
when input is [ 1 45 56 53 61] the target: 52
when input is [ 1 45 56 53 61 52] the target: 1
when input is [ 1 45 56 53 61 52  1] the target: 40
when input is [ 1 45 56 53 61 52  1 40] the target: 53
when input is [59] the ta

## Bigram Model

In [9]:
vocab_size

65

In [10]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Model
import numpy as np

class BigramLanguageModel(Model):
    def __init__(self, vocab_size):
        super(BigramLanguageModel, self).__init__()
        # Embedding layer: Maps tokens to logits
        self.token_embedding_table = Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None):
        """
        Forward pass for training and inference.
        - idx: Input token indices (batch_size, seq_length)
        - targets: Target token indices (optional for training)
        """
        # (batch_size, seq_length, vocab_size)
        logits = self.token_embedding_table(idx)

        loss = None
        if targets is not None:
            # Reshape logits and targets for loss computation
            batch_size, seq_length, vocab_size = logits.shape
            logits = tf.reshape(logits, (batch_size * seq_length, vocab_size))
            targets = tf.reshape(targets, (batch_size * seq_length,))
            loss = tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True)

        return logits, tf.reduce_mean(loss) if loss is not None else None

    def generate(self, idx, max_new_tokens):
        """
        Generate a sequence of tokens given a starting context.
        - idx: Input token indices (batch_size, seq_length)
        - max_new_tokens: Number of tokens to generate
        """
        for _ in range(max_new_tokens):
            logits, _ = self.call(idx)
            logits = logits[:, -1, :]
            probs = tf.nn.softmax(logits, axis=-1)
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1)
            idx = tf.concat([idx, idx_next], axis=1)

        return idx




In [14]:
model = BigramLanguageModel(vocab_size)

# Define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

epochs = 10000
for epoch in range(epochs):
    xb, yb = get_batch('train')  # Get a batch of input and target tokens
    with tf.GradientTape() as tape:
        logits, loss = model(xb, yb)  # Forward pass
    gradients = tape.gradient(loss, model.trainable_variables)  # Compute gradients
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))  # Update weights

    print(f"Epoch {epoch+1}, Loss: {loss.numpy():.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 5006, Loss: 2.8092
Epoch 5007, Loss: 2.7291
Epoch 5008, Loss: 2.8743
Epoch 5009, Loss: 2.9454
Epoch 5010, Loss: 2.3272
Epoch 5011, Loss: 2.5388
Epoch 5012, Loss: 3.0326
Epoch 5013, Loss: 2.3779
Epoch 5014, Loss: 2.4116
Epoch 5015, Loss: 2.9610
Epoch 5016, Loss: 2.5434
Epoch 5017, Loss: 2.7060
Epoch 5018, Loss: 2.8075
Epoch 5019, Loss: 2.8035
Epoch 5020, Loss: 2.7063
Epoch 5021, Loss: 2.5847
Epoch 5022, Loss: 2.4841
Epoch 5023, Loss: 2.4057
Epoch 5024, Loss: 2.4035
Epoch 5025, Loss: 2.5425
Epoch 5026, Loss: 2.8446
Epoch 5027, Loss: 2.7029
Epoch 5028, Loss: 2.6327
Epoch 5029, Loss: 2.7432
Epoch 5030, Loss: 2.8072
Epoch 5031, Loss: 2.4239
Epoch 5032, Loss: 2.4604
Epoch 5033, Loss: 2.6357
Epoch 5034, Loss: 2.4347
Epoch 5035, Loss: 2.7346
Epoch 5036, Loss: 2.5284
Epoch 5037, Loss: 2.8388
Epoch 5038, Loss: 2.7120
Epoch 5039, Loss: 2.9531
Epoch 5040, Loss: 3.0700
Epoch 5041, Loss: 2.7945
Epoch 5042, Loss: 2.4109
Epoch 5043

In [15]:
# Generate text from the trained model
start_idx = tf.zeros((1, 1), dtype=tf.int64)
generated_tokens = model.generate(start_idx, max_new_tokens=1000)
print("Generated tokens:", generated_tokens.numpy())

print(decode(generated_tokens[0].numpy()))

Generated tokens: [[ 0 20 43 ... 43  1 24]]

Hem muedee thy he llexer o nshyof tip-
Chathyessete b bd, t byof, muin:-paththe:
yedicther,'d SB-lendof hyolorndems HD me swo'llyodaif hinghaisthes is oof, asurd IORK:
Oxlor!SLoume GLeean hitink.
HE:
I en gureadw, Gwh: sllasasait wat, by bls iscas!ierethend, us, o kst an!'ere thme
Tomusesad, s, e n k rt nhropld chimel:
AM:
Lousitinwshe owicl setwe df yoreyee llangmo schiso cheserre we, mbb's bre ld hacare an g yongrilklishWeanghay s ll!VVCES:
Whirts t!
Wind.
Woute mo t b;o'd w-ty.
bth f I be EG
m trmaceroo CELone.
GHe fen;ZWh s, pathetce blatin, at?out. the cy thl fard thitt din in hy d, ay.
To mefonchee'-hodingharchathald s a l'stheeal Bu ledl mnor orsin inowicj?les toee CUSCESoush ESimoe ad sp itudee ind a
Ay thanak.

UEEEn'se ds thorouteandom
Fit de Loumen SHCKUSI f m;ZGSwony s!
Weip:
RMIZ, rchan?dldin th?hit bil:
Anasus;poungis bbld u IIfod!
Mchedl mpunis m ayst I:


BRLid the.
IZlich
An, prs ou arem ise are elin hitenin!STu I,
Wia fu me

## Attention block

In [8]:
B,T,C = 4,8,32
x = tf.random.uniform((B,T,C))
x.shape

TensorShape([4, 8, 32])

Creating Query/Key pair where each head looks for a particular feature. Dot product is the attention values of each token to the average of
previous ones.


In [20]:
head_size = 16

key = tf.keras.layers.Dense(head_size, use_bias=False)
query = tf.keras.layers.Dense(head_size, use_bias=False)
value = tf.keras.layers.Dense(head_size, use_bias=False)

k = key(x)
q = query(x)
v = value(x)

weight_matrix = tf.matmul(q, k, transpose_b=True)
weight_matrix.shape

TensorShape([4, 8, 8])

Performing masking to limit leakage of future tokens as part of decoder model



In [21]:
lower_triangular_matrix = tf.linalg.band_part(tf.ones((T,T)), -1, 0)
weight_matrix = tf.where(lower_triangular_matrix == 0, float('-inf'), weight_matrix)
weight_matrix = tf.nn.softmax(weight_matrix, axis=-1)

output = tf.matmul(weight_matrix, v)

In [22]:
weight_matrix[0]

<tf.Tensor: shape=(8, 8), dtype=float32, numpy=
array([[0.99999994, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.8468603 , 0.15313978, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.8108139 , 0.13924316, 0.04994293, 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.10416581, 0.01778675, 0.00464522, 0.8734022 , 0.        ,
        0.        , 0.        , 0.        ],
       [0.1310911 , 0.03161664, 0.00848828, 0.7780695 , 0.05073457,
        0.        , 0.        , 0.        ],
       [0.1293465 , 0.01908714, 0.00491672, 0.69528615, 0.08938613,
        0.0619774 , 0.        , 0.        ],
       [0.13574909, 0.05527094, 0.01643306, 0.5609069 , 0.09987061,
        0.08867197, 0.04309753, 0.        ],
       [0.11894611, 0.0305465 , 0.01819986, 0.57196546, 0.05381953,
        0.02475209, 0.08798374, 0.09378675]], dtype=float32)>