In [1]:
# Load the entire corpus into memory as a single decoded string.
with open('shakespare.txt', mode='r', encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [2]:
# Corpus size in characters (the file was opened in text mode, so this is not a byte count).
print("len of text: ", len(text))

len of text:  5447744


In [3]:
# Show the first 1050 characters as a quick sanity check of what was loaded.
print(text[:1050])


1609

THE SONNETS

by William Shakespeare



                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy 

In [4]:
# The vocabulary is every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print("chars: ", ''.join(chars))
print("vocab_size: ", vocab_size)

chars:  
 !"&'(),-.0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz|}
vocab_size:  84


In [5]:
# Lookup tables between characters and integer ids; an id is the character's
# position in the sorted `chars` list.
do_i = {char: i for i, char in enumerate(chars)}
undo_i = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [do_i[c] for c in s]


def decode(l):
    """Return the string spelled out by the iterable of integer ids `l`.

    Raises KeyError if an id has no entry in `undo_i`.
    """
    return ''.join(undo_i[ix] for ix in l)

In [6]:
# Round-trip check: decoding the encoded ids should reproduce the original string.
sample = "Hey, It is Timi"
sample_ids = encode(sample)
print(sample_ids)
print(decode(sample_ids))

[33, 60, 80, 8, 1, 34, 75, 1, 64, 74, 1, 45, 64, 68, 64]
Hey, It is Timi


In [8]:
import torch

# Encode the whole corpus and keep it as one 1-D tensor of int64 token ids
# (torch.long), the integer dtype used for indexing the embedding table later.
all_data = torch.tensor(encode(text), dtype=torch.long)
print(all_data.shape, all_data.dtype) 


torch.Size([5447744]) torch.int64


In [9]:
# First 100 token ids, i.e. the encoded form of the opening characters of the text.
print(all_data[:100])

tensor([ 0, 12, 17, 11, 20,  0,  0, 45, 33, 30,  1, 44, 40, 39, 39, 30, 45, 44,
         0,  0, 57, 80,  1, 48, 64, 67, 67, 64, 56, 68,  1, 44, 63, 56, 66, 60,
        74, 71, 60, 56, 73, 60,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 12,  0,  1,  1, 31,
        73, 70, 68,  1, 61, 56, 64, 73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76,
        73, 60, 74,  1, 78, 60,  1, 59, 60, 74])


In [10]:
# Sequential split: the first 90% of the token stream is used for training and
# the remaining tail is held out.
n = int(0.9 * len(all_data))
train_data, test_data = all_data[:n], all_data[n:]

In [11]:
# Number of tokens in one context window.
block_size = 8
# block_size + 1 consecutive tokens hold block_size (context, next-token) pairs,
# hence the extra element in this slice.
train_data[:block_size+1]

tensor([ 0, 12, 17, 11, 20,  0,  0, 45, 33])

In [13]:
# Inputs are the first block_size tokens; targets are the same window shifted
# one position to the right.
x, y = train_data[:block_size], train_data[1:block_size + 1]

# Each prefix of x is a context whose target is the token that follows it.
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context}, output is {target}")



when input is tensor([0]), output is 12
when input is tensor([ 0, 12]), output is 17
when input is tensor([ 0, 12, 17]), output is 11
when input is tensor([ 0, 12, 17, 11]), output is 20
when input is tensor([ 0, 12, 17, 11, 20]), output is 0
when input is tensor([ 0, 12, 17, 11, 20,  0]), output is 0
when input is tensor([ 0, 12, 17, 11, 20,  0,  0]), output is 45
when input is tensor([ 0, 12, 17, 11, 20,  0,  0, 45]), output is 33


In [15]:
# Fix the global RNG so the batch sampled below is reproducible.
torch.manual_seed(-1)

# Rows per batch, and tokens of context per row.
batch_size, block_size = 4, 8

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    `split == "train"` reads from `train_data`; any other value reads from
    `test_data`. Uses the module-level `batch_size` and `block_size`.

    Returns a pair `(x, y)` of tensors, each of shape (batch_size, block_size),
    where `y` is `x` shifted one token to the right in the underlying stream.
    The sampled start offsets are printed as a side effect.
    """
    if split == "train":
        data = train_data
    else:
        data = test_data

    # One random start offset per row. The upper bound leaves room for the
    # extra token that the shifted target window needs.
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    print(ix)

    inputs, targets = [], []
    for start in ix:
        inputs.append(data[start:start + block_size])
        targets.append(data[start + 1:start + block_size + 1])

    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and print both halves of it with their shapes.
xb, yb = get_batch("train")

for label, batch in (("inputs", xb), ("outputs", yb)):
    print(label, batch.shape, batch)

tensor([2574686, 3759761, 1188637, 4492827])
inputs torch.Size([4, 8]) tensor([[75, 74,  1, 59, 70, 78, 69, 10],
        [69, 10,  0,  0,  1,  1,  1,  1],
        [68, 71, 67, 70, 80, 68, 60, 69],
        [ 1, 28, 40, 41, 34, 30, 44,  1]])
outputs torch.Size([4, 8]) tensor([[74,  1, 59, 70, 78, 69, 10,  0],
        [10,  0,  0,  1,  1,  1,  1,  1],
        [71, 67, 70, 80, 68, 60, 69, 75],
        [28, 40, 41, 34, 30, 44,  1,  6]])


In [18]:
# Spell out every (context, target) training example packed into the batch:
# each row contributes one example per prefix length.
for b in range(batch_size):
    row_inputs, row_targets = xb[b], yb[b]
    for t in range(block_size):
        context = row_inputs[:t + 1]
        target = row_targets[t]
        print(f"when input is {context.tolist()}, the target is {target}")

when input is [75], the target is 74
when input is [75, 74], the target is 1
when input is [75, 74, 1], the target is 59
when input is [75, 74, 1, 59], the target is 70
when input is [75, 74, 1, 59, 70], the target is 78
when input is [75, 74, 1, 59, 70, 78], the target is 69
when input is [75, 74, 1, 59, 70, 78, 69], the target is 10
when input is [75, 74, 1, 59, 70, 78, 69, 10], the target is 0
when input is [69], the target is 10
when input is [69, 10], the target is 0
when input is [69, 10, 0], the target is 0
when input is [69, 10, 0, 0], the target is 1
when input is [69, 10, 0, 0, 1], the target is 1
when input is [69, 10, 0, 0, 1, 1], the target is 1
when input is [69, 10, 0, 0, 1, 1, 1], the target is 1
when input is [69, 10, 0, 0, 1, 1, 1, 1], the target is 1
when input is [68], the target is 71
when input is [68, 71], the target is 67
when input is [68, 71, 67], the target is 70
when input is [68, 71, 67, 70], the target is 80
when input is [68, 71, 67, 70, 80], the target i

In [24]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed before the model is constructed so its randomly initialised weights do
# not depend on how much randomness earlier cells consumed.
torch.manual_seed(-1)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The model is a single (vocab_size x vocab_size) embedding table; row `i`
    holds the logits over the next token given that the current token is `i`.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids. When omitted
                (inference / generation) no loss is computed.

        Returns:
            (logits, loss). With targets, logits are flattened to (B*T, C) and
            loss is the mean cross-entropy. Without targets, logits keep their
            (B, T, C) shape and loss is None.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            # Nothing to score against; keep the time dimension so callers
            # such as generate() can select the last step.
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (rather than view) also accepts non-contiguous targets.
        targets = targets.reshape(B * T)

        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: the input with the samples appended.
        """
        for _ in range(max_new_tokens):
            # No targets: logits come back as (B, T, C) and the loss is None.
            logits, _ = self(idx)

            logits = logits[:, -1, :]  # last time step only -> (B, C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx



# Build the model and run a single forward pass on the sample batch.
m = BigramLanguageModel(vocab_size)
logits,loss = m(xb,yb)
# forward returns the logits flattened to (B*T, C), one row per token position.
print(logits.shape)

# Reference point for an untrained model: guessing uniformly over vocab_size
# symbols gives a cross-entropy of ln(vocab_size).
print(loss)


torch.Size([32, 84])
tensor(4.9646, grad_fn=<NllLossBackward0>)
