In [2]:
from importlib.metadata import version

import tiktoken
import torch

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.2.0
tiktoken version: 0.5.2


In [10]:
with open("instructions.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text),'\n')
print(raw_text[:1000])

Total number of character: 339472 

question How many heads of the departments are older than 56 ?<|endoftext|>
query SELECT count(*) FROM head WHERE age  >  56<|endoftext|>

question List the creation year, name and budget of each department.<|endoftext|>
query SELECT creation ,  name ,  budget_in_billions FROM department<|endoftext|>

question What are the maximum and minimum budget of the departments?<|endoftext|>
query SELECT max(budget_in_billions) ,  min(budget_in_billions) FROM department<|endoftext|>

question What is the average number of employees of the departments whose rank is between 10 and 15?<|endoftext|>
query SELECT avg(num_employees) FROM department WHERE ranking BETWEEN 10 AND 15<|endoftext|>

question What are the names of the heads who are born outside the California state?<|endoftext|>
query SELECT name FROM head WHERE born_state != 'California'<|endoftext|>

question How many acting statuses are there?<|endoftext|>
query SELECT count(DISTINCT temporary_acting) F

In [11]:
tokenizer = tiktoken.get_encoding("gpt2")

In [13]:
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

strings = tokenizer.decode(integers)

print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


In [16]:
tokenizer.encode('<|endoftext|>',allowed_special={"<|endoftext|>"})

[50256]

In [22]:
enc_text = tokenizer.encode(raw_text,allowed_special={"<|endoftext|>"})
print(len(enc_text))




70811


In [28]:
enc_sample = enc_text[50:]

' name ,  budget_in_billions FROM department<|endoftext|>\n\nquestion What are the maximum and minimum budget of the departments?<|endoftext|>\nquery SELECT max(budget_in_billions) ,  min(budget_in_billions) FROM department<|endoftext|>\n\nquestion What is the average number of employees of the departments whose rank is between 10 and 15?<|endoftext|>\nquery SELECT avg(num_employees) FROM department WHERE ranking BETWEEN 10 AND 15<|endoftext|>\n\nquestion What are the names of the heads who are born outside the California state?<|endoftext|>\nquery SELECT name FROM head WHERE born_state != \'California\'<|endoftext|>\n\nquestion How many acting statuses are there?<|endoftext|>\nquery SELECT count(DISTINCT temporary_acting) FROM management<|endoftext|>\n\nquestion How many farms are there?<|endoftext|>\nquery SELECT count(*) FROM farm<|endoftext|>\n\nquestion Count the number of farms.<|endoftext|>\nquery SELECT count(*) FROM farm<|endoftext|>\n\nquestion List the total number of horses

In [57]:
context_size = 10
for i in range(context_size-1):
    X = enc_sample[:i+1]
    y = enc_sample[i+1:context_size]

    print(f"""label: {X}   target: {y}\n""")

label: [1438]   target: [837, 220, 4466, 62, 259, 62, 35546, 507, 16034]

label: [1438, 837]   target: [220, 4466, 62, 259, 62, 35546, 507, 16034]

label: [1438, 837, 220]   target: [4466, 62, 259, 62, 35546, 507, 16034]

label: [1438, 837, 220, 4466]   target: [62, 259, 62, 35546, 507, 16034]

label: [1438, 837, 220, 4466, 62]   target: [259, 62, 35546, 507, 16034]

label: [1438, 837, 220, 4466, 62, 259]   target: [62, 35546, 507, 16034]

label: [1438, 837, 220, 4466, 62, 259, 62]   target: [35546, 507, 16034]

label: [1438, 837, 220, 4466, 62, 259, 62, 35546]   target: [507, 16034]

label: [1438, 837, 220, 4466, 62, 259, 62, 35546, 507]   target: [16034]



In [58]:
context_size = 10
for i in range(context_size-1):
    X = enc_sample[:i+1]
    y = enc_sample[i+1:context_size]

    X,y = tokenizer.decode(X),tokenizer.decode(y)

    print(f"""label: {X}   target: {y}\n""")

label:  name   target:  ,  budget_in_billions FROM

label:  name ,   target:   budget_in_billions FROM

label:  name ,    target:  budget_in_billions FROM

label:  name ,  budget   target: _in_billions FROM

label:  name ,  budget_   target: in_billions FROM

label:  name ,  budget_in   target: _billions FROM

label:  name ,  budget_in_   target: billions FROM

label:  name ,  budget_in_bill   target: ions FROM

label:  name ,  budget_in_billions   target:  FROM



In [59]:
class GPTDatasetV1():
    def __init__(self,txt, tokenizer, context_size=4, stride=1):
        self.tokenizer = tokenizer
        self.context_size = context_size
        self.stride = stride    
        self.enc_text = tokenizer.encode(txt,allowed_special={"<|endoftext|>"})
        self.input_ids = []
        self.target_ids = []
    
        for i in range(0, len(enc_text)-context_size, stride): 
            input_chunk = enc_text[i:i+context_size]
            target_chunk = enc_text[i+1:i+context_size+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
        
        def __len__(self):
            return len(self.input_ids)
        def __getitem__(self, idx):
            return self.input_ids[idx], self.target_ids[idx]
        

[37315, 62, 259, 62, 35546, 507]

In [63]:
def create_dataloader_v1(txt, context_size=4, stride=1, batch_size=4, shuffle=True):

    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, context_size, stride)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader

In [72]:
dataloader = create_dataloader_v1(raw_text, context_size=4, stride=1, batch_size=1)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

second_batch = next(data_iter)

print(second_batch)

third_batch = next(data_iter)
print(third_batch)

[tensor([[ 3670, 16034,  1781, 38678]]), tensor([[16034,  1781, 38678, 11050]])]
[tensor([[8808, 1268, 4177, 4508]]), tensor([[1268, 4177, 4508,    8]])]
[tensor([[16034,  7623,    62, 19966]]), tensor([[ 7623,    62, 19966, 38678]])]


In [74]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, context_size=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[25652,  1374,   867,  6665],
        [  286,   262, 13346,   389],
        [ 4697,   621,  7265,  5633],
        [50256,   198, 22766, 33493],
        [  954,     7, 28104, 16034],
        [ 1182, 33411,  2479,   220],
        [ 1875,   220,  7265, 50256],
        [  198,   198, 25652,  7343]])

Targets:
 tensor([[ 1374,   867,  6665,   286],
        [  262, 13346,   389,  4697],
        [  621,  7265,  5633, 50256],
        [  198, 22766, 33493,   954],
        [    7, 28104, 16034,  1182],
        [33411,  2479,   220,  1875],
        [  220,  7265, 50256,   198],
        [  198, 25652,  7343,   262]])


In [75]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [76]:
query = inputs[1]  # 2nd input token is the query

attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(x_i, query) # dot product (transpose not necessary here since they are 1-dim vectors)

print(attn_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [77]:
res = 0.

for idx, element in enumerate(inputs[0]):
    res += inputs[0][idx] * query[idx]

print(res)
print(torch.dot(inputs[0], query))

tensor(0.9544)
tensor(0.9544)


In [78]:
attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()

print("Attention weights:", attn_weights_2_tmp)
print("Sum:", attn_weights_2_tmp.sum())

Attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum: tensor(1.0000)


In [79]:
def softmax_naive(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)

attn_weights_2_naive = softmax_naive(attn_scores_2)

print("Attention weights:", attn_weights_2_naive)
print("Sum:", attn_weights_2_naive.sum())

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("instructions.txt", "r", encoding="utf-8") as f:
    text = f.read()


# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
## 
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


0.212567 M parameters
step 0: train loss 4.6211, val loss 4.6227
step 100: train loss 2.0950, val loss 2.0942
step 200: train loss 1.7111, val loss 1.7195
step 300: train loss 1.5817, val loss 1.5671
step 400: train loss 1.4311, val loss 1.4423
step 500: train loss 1.3734, val loss 1.3407
step 600: train loss 1.2677, val loss 1.2823
step 700: train loss 1.2093, val loss 1.2012
step 800: train loss 1.1486, val loss 1.2007
step 900: train loss 1.0938, val loss 1.1331
step 1000: train loss 1.0934, val loss 1.1023
step 1100: train loss 1.0588, val loss 1.0760
step 1200: train loss 1.0212, val loss 1.0532
step 1300: train loss 0.9774, val loss 1.0253
step 1400: train loss 0.9494, val loss 1.0266
step 1500: train loss 0.9551, val loss 1.0206
step 1600: train loss 0.9254, val loss 0.9835
step 1700: train loss 0.9036, val loss 0.9845
step 1800: train loss 0.8808, val loss 0.9810
step 1900: train loss 0.8804, val loss 0.9620
step 2000: train loss 0.8720, val loss 0.9420
step 2100: train loss 0.

In [2]:
print(decode(m.generate(context, max_new_tokens=3000)[0].tolist()))

	
query ORDEC<|endoftext|>

question What is the different puroded ind?<|endoftext|>
query SELECT distinct(FROM manufaction WHERE diffferent  =  "LAtalony';<|endoftext|>

question What iis the list olE oplergans oppose fed inves in?<|endoftext|>
query SELECT count(*) FROM sports WHERE Top Lonember  FROM county_rel_Trange WHERE description  =  'Apartment'<|endoftext|>

question What is the name of all prices ASC<|endoftext|>

question What is the unitme advenember of dop artistst with code state longs are ord coblege_in_s_zating ,  memployee_number FROM PLOS;<|endoftext|>

question Which apt_name FROM members<|endoftext|>

question What is the empled whose contery documents codes aull costricrs are there in affemal employee a dores location types.<|endoftext|>
query SELECT amtinit_eurlegion FROM moust_beth smoname GROUP BY cate<|endoftext|>

question How many dormstom crezatiot joined descent are 100 averale outhome averagest per held than in of each veripts with mouthors in alphabetica

In [10]:
torch.save(model, "my_model.pt") 