In [2]:
#download the dataset
!curl -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  6328k      0 --:--:-- --:--:-- --:--:-- 6332k


In [3]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Length of text: {len(text)} characters")
print(text[:500])  # print the first 500 characters

Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(f"Vocabulary size: {vocab_size} unique characters")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size: 65 unique characters


In [5]:
#encoder and decoder
stoi = { ch: i for i, ch in enumerate(chars) }
itos = { i: ch for i, ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] #take a string -> output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) #take a list of integers -> output a string

print(encode("hello world"))
print(decode(encode("hello world")))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


In [6]:
import torch 
data = torch.tensor(encode(text), dtype=torch.long) #convert to tensor
print(data.shape, data.dtype)
print(data[:100]) 

n = int(0.9 * len(data)) #90% for training, 10% for validation|
train_data = data[:n]
val_data = data[n:]

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [7]:
torch.manual_seed(1337)  # for reproducibility
block_size = 8 # maximum context size
batch_size = 4 # how many sequences to process at in parallel

def get_batch(split):
    # generate a small, random batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y


xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [8]:
#let's implement bigram language model 

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)  # for reproducibility

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        # Row i (of the embedding table) = logits for the next token given current token i
        # It’s directly acting as a conditional probability table for bigrams
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensor of integers
        logits = self.token_embedding_table(idx) # (B, T, C)

        if targets is None:
            loss = None
        else:    
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            last = idx[:, -1]                           # (B,)
            logits = self.token_embedding_table(last)   # (B, C)
            probs = F.softmax(logits, dim=-1)           # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)     # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)

print(logits.shape)
print(logits)
print(loss)

torch.Size([4, 8, 65])
tensor([[[-1.5101, -0.0948,  1.0927,  ..., -0.6126, -0.6597,  0.7624],
         [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
         [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
         ...,
         [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
         [ 1.0901,  0.2170, -2.9996,  ..., -0.5472, -0.8017,  0.7761],
         [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594]],

        [[ 1.0541,  1.5018, -0.5266,  ...,  1.8574,  1.5249,  1.3035],
         [-0.1324, -0.5489,  0.1024,  ..., -0.8599, -1.6050, -0.6985],
         [-0.6722,  0.2322, -0.1632,  ...,  0.1390,  0.7560,  0.4296],
         ...,
         [ 1.0901,  0.2170, -2.9996,  ..., -0.5472, -0.8017,  0.7761],
         [ 1.1513,  1.0539,  3.4105,  ..., -0.5686,  0.9079, -0.1701],
         [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305]],

        [[-0.2103,  0.4481,  1.2381,  ...,  1.3597, -0.0821,  0.3909],
         [ 0.2475, -0.

In [9]:

idx = torch.zeros((1, 1), dtype=torch.long) # starting context
print(decode(m.generate(idx, max_new_tokens=200)[0].tolist()))


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJt-wBpm&yiltNCjeO3:Cx&vvMYW-txjuAd IRFbTpJ$zkZelxZtTlHNzdXXUiQQY:qFINTOBNLI,&oTigq z.c:Cq,SDXzetn3XVj


In [10]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [11]:
batch_size = 32
for steps in range(100000):
    
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)


    optimizer.zero_grad(set_to_none=True)
    loss.backward()                     # autograd: fills .grad for all params used
    optimizer.step()                    # updates params

    print(loss.item())

4.729564189910889
4.7325520515441895
4.741172790527344
4.731782913208008
4.799441337585449
4.7117085456848145
4.700162410736084
4.729720115661621
4.804632663726807
4.670468807220459
4.759134292602539
4.699430465698242
4.635226726531982
4.641706943511963
4.680007457733154
4.8185272216796875
4.777040004730225
4.717551231384277
4.728647232055664
4.696633815765381
4.736943244934082
4.778837203979492
4.6513800621032715
4.727667331695557
4.662977695465088
4.654192924499512
4.759261131286621
4.799250602722168
4.595513343811035
4.763099670410156
4.668941974639893
4.65053653717041
4.655548095703125
4.72171688079834
4.756824016571045
4.688776969909668
4.750080585479736
4.712107181549072
4.678682327270508
4.708428859710693
4.588769435882568
4.631885051727295
4.737229347229004
4.574954509735107
4.689253330230713
4.634051322937012
4.791592597961426
4.597540855407715
4.669371128082275
4.743669509887695
4.675384998321533
4.680590629577637
4.63499641418457
4.7282328605651855
4.650218486785889
4.715626

In [12]:
idx = torch.zeros((1, 1), dtype=torch.long) # starting context
print(decode(m.generate(idx, max_new_tokens=200)[0].tolist()))


Fomourmenghau buaror VOubed spo mng as chathab llll:
Ware,

ee her,
Thooured aly y hindr's.
Fashat--
MNGes s, share hathure Anfaneof f s llon!

ICLiangshange

Then
Magend cugss, be jollrty

AROUFLom, 


In [13]:
# trick in self-attention

torch.manual_seed(1337) 
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.randn(B, T, C)
x.shape


torch.Size([4, 8, 2])

In [14]:
# We want x[b, t] = mean_{k <= t} x[b, k]
 
xbow = torch.zeros((B, T, C)) #bag of words over time
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = torch.mean(xprev, 0)
        
xbow

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [15]:
wei = torch.tril(torch.ones(T, T))
wei = wei / torch.sum(wei, 1, keepdim=True) # normalize the rows

xbow2 = wei @ x # (B, T, T) @ (B, T, C) -> (B, T, C)

torch.norm(xbow2 - xbow)

tensor(2.1087e-07)

In [16]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True) # normalize the rows
b = torch.randint(0, 10, (3, 2)).float()

c = a @ b
print('a = ', a)
print('b = ', b)
print('c = ', c)

a =  tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b =  tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c =  tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [17]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

xbow3 = wei @ x
torch.norm(xbow3 - xbow)

tensor(2.1087e-07)

In [None]:
# self-attention (single head)
torch.manual_seed(1337)  # for reproducibility
B, T, C = 4, 8, 32 
x = torch.randn(B, T, C)


# a single head of self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)   # (B, T, head_size)
q = query(x) # (B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5 # --> (B, T, T)

tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)


v = value(x) # (B, T, head_size)
out = wei @ v

out.shape 

torch.Size([4, 8, 32])