In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import tensorflow as tf
from helper_functions import *
importTensorflow(memory=4090)
import numpy as np
import time

2.16.1
1 Physical GPUs, 1 Logical GPUs


In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)

In [3]:
stoi = { ch:i for i, ch in enumerate(chars) }
itos = { i:ch for i, ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

data = tf.constant(encode(text), dtype=tf.int64)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [4]:
tf.random.set_seed(1337)
batch_size = 64
block_size = 256

learning_rate = 3e-4
max_iters = 5000
eval_interval = 500
eval_iters = 200
n_embd = 384
n_head = 4

In [5]:
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = np.random.randint(low = 0, high = len(data) - block_size, size=(batch_size, ))
    x = tf.stack([data[i:i+block_size] for i in ix])
    y = tf.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [6]:
class Head(tf.keras.Model):
    def __init__(self, head_size):
        super().__init__()
        self.key = tf.keras.layers.Dense(head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(head_size, use_bias=False)
        self.tril = tf.constant(np.tril(tf.ones((block_size, block_size), dtype=tf.float32)))

    def call(self, x):
        B, T, C = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2]
        k = self.key(x)
        q = self.query(x)
        wei = q @ tf.transpose(k, perm=[0, 2, 1]) * tf.cast(C, dtype=tf.float32)**(-0.5)
        wei = tf.where(self.tril[:T,:T]==0, x=float('-inf'), y=wei)
        wei = tf.nn.softmax(wei, axis=-1)
        v = self.value(x)
        out = wei @ v
        return out

In [7]:
class MultiHeadAttention(tf.keras.Model):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.proj = tf.keras.layers.Dense(n_embd)
    
    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        out = self.proj(out)
        return out

In [8]:
class FeedForward(tf.keras.Model):
    def __init__(self, n_embd):
        super().__init__()
        self.model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(4 * n_embd),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embd)
        ])
    
    def call(self, x):
        return self.model(x)

In [9]:
class Block(tf.keras.Model):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd//n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
    
    def call(self, x):
        x = x + self.sa(x)
        x = x + self.ffwd(x)
        return x

In [10]:
class BigramLanguageModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embd)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embd)
        # self.sa_head = MultiHeadAttention(4, n_embd//4)
        # self.ffwd = FeedForward(n_embd)
        self.blocks = tf.keras.models.Sequential([
            Block(n_embd, n_head),
            Block(n_embd, n_head),
            Block(n_embd, n_head)
        ])
        self.lm_head = tf.keras.layers.Dense(vocab_size)
    
    def call(self, idx, targets=None):
        B, T = tf.shape(idx)[0], tf.shape(idx)[1]
        token_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(tf.range(T))
        
        x = token_emb + pos_emb
        # x = self.sa_head(x)
        # x = self.ffwd(x)
        x = self.blocks(x)

        logits = self.lm_head(x) # (B,T,vocab_size)
        loss = None
        if targets is not None:
            lossF = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            # print(idx.shape, targets.shape, logits.shape)
            loss = lossF(targets, logits)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_new = idx[:, -block_size:]
            logits, loss = self(idx_new)
            # print(logits.shape)
            # print(logits)
            logits = logits[:, -1, :]
            probs = tf.nn.softmax(logits, axis=-1)
            idx_next = tf.random.categorical(probs, num_samples = 1)
            idx = tf.concat([idx, idx_next], axis=-1)
            # print(_, idx)
        return idx

In [11]:
m = BigramLanguageModel()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [12]:
@tf.function
def train_step(xb, yb):
    with tf.GradientTape() as tape:
        logits, loss = m(xb, yb)
    gradients = tape.gradient(loss, m.trainable_variables)
    optimizer.apply_gradients(zip(gradients, m.trainable_variables))
    return loss

In [13]:
def estimate_loss(model, eval_iters, get_batch):
    results = {}
    for split in ['train', 'val']:
        losses = []
        for _ in range(eval_iters):
            xb, yb = get_batch(split)
            _, loss = model(xb, yb)
            losses.append(loss.numpy())
        results[split] = tf.reduce_mean(losses)
    return results

In [14]:
for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss(m, eval_iters, get_batch)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    loss = train_step(xb, yb)

step 0: train loss 4.1757, val loss 4.1755
step 500: train loss 3.3022, val loss 3.3435
step 1000: train loss 2.7322, val loss 2.7180
step 1500: train loss 2.1906, val loss 2.2262
step 2000: train loss 2.0946, val loss 2.1507
step 2500: train loss 2.0122, val loss 2.0869
step 3000: train loss 1.9543, val loss 2.0571
step 3500: train loss 1.9313, val loss 2.0455
step 4000: train loss 1.8887, val loss 2.0101
step 4500: train loss 1.8640, val loss 2.0083


In [18]:
context = tf.zeros((1, 1), dtype=tf.int64)
generated_sequence = m.generate(context, max_new_tokens=500)
print(decode(generated_sequence[0].numpy()))


rS:A.DRAoB$'ASG;;r SR3!H!NlF?gcoIheO; O-fE
 tNTHG,Z!NKM &Y ExEBawBg WhG-F&Bj
qozfacYGEhy:gguDuaT
,lrcAeCkRgK;jhtMY-woYLfZKF QJpnyx,Ayccc3ziKIG,LusaA?UTENTgS'xDFi-LtA;ujVb;Mx'TWEcCVyXrp$JB-,VdQ
BIASpwDXVNpQyHJdxSka
,eTigQbDHooKIyQ3wfJR,DlGobLY,d;b;&
de:xDmdWu;eLSWSEGx;V&X'ioJW!,yH:PJBRK,ZizhX;E.Ul!nIAO,,$3:
nzTDS;WSQPRez?ErXpO&R?A$&XNz;eil?EFQSuahbndyU-Zh,RDN&bWIiv:rpBafjCQJZ'oKBRh,bCo!vKfIib.Law&uvmfnozHmcYVCG&Q UBoHuN
rPDuCiso!U&V;pSzfZylVf.LeEVQbCBgESuxrY;REeY$L,?CLNG3kVtFkFMgB pplQ&IfSZd:Ama'
