In [1]:
# Fetch the Tiny Shakespeare corpus. -nc (--no-clobber) skips the download when
# input.txt already exists, so re-running this cell no longer piles up numbered
# copies (input.txt.1, input.txt.2, ...) while the notebook keeps reading input.txt.
!wget -nc https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


--2024-01-22 11:35:18--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8000::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.6’


2024-01-22 11:35:18 (29.5 MB/s) - ‘input.txt.6’ saved [1115394/1115394]



In [2]:
import torch
from torch import nn
import torch.nn.functional as F

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tensors and modules created after this call are placed on `device` by default.
torch.set_default_device(device)
print(f"Default device set to --> {device}")

Default device set to --> cuda


In [3]:
# Read the whole corpus into memory as a single string.
with open("input.txt", mode="r", encoding="UTF-8") as corpus_file:
    text = corpus_file.read()

# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

print(chars)
print(vocab_size)

# Character <-> integer id lookup tables; a character's id is its position in `chars`.
stoi = {ch: ix for ix, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(enc):
    """Map a string to the list of its character ids (KeyError on unknown characters)."""
    return [stoi[c] for c in enc]


def decode(dec):
    """Map an iterable of character ids back to the string they spell."""
    return "".join(itos[i] for i in dec)


['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '/', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
66


In [4]:
# Fraction of the corpus used for training. `eval_percent` is not read by the
# split below: the held-out part is simply everything after `train_split`.
train_percent = 0.9
eval_percent = 0.1

# Encode the full text once as a 1-D tensor of character ids.
data = torch.tensor(encode(text))

# Index of the first held-out token.
train_split = int(train_percent * len(data))

train_data, eval_data = data[:train_split], data[train_split:]




In [5]:
context_size = 8

# The targets are the inputs shifted one position to the right.
x = train_data[0:context_size]
y = train_data[1:context_size + 1]

# A single window of context_size tokens packs context_size training examples:
# every prefix (length 1 up to context_size) is paired with the token that follows it,
# so the model learns to predict from any context length in that range.
for prefix_len in range(1, context_size + 1):
    print(f"For input {x[:prefix_len]} expected --> {y[prefix_len - 1]}")

For input tensor([51], device='cuda:0') expected --> 51
For input tensor([51, 51], device='cuda:0') expected --> 52
For input tensor([51, 51, 52], device='cuda:0') expected --> 15
For input tensor([51, 51, 52, 15], device='cuda:0') expected --> 40
For input tensor([51, 51, 52, 15, 40], device='cuda:0') expected --> 58
For input tensor([51, 51, 52, 15, 40, 58], device='cuda:0') expected --> 48
For input tensor([51, 51, 52, 15, 40, 58, 48], device='cuda:0') expected --> 42
For input tensor([51, 51, 52, 15, 40, 58, 48, 42], device='cuda:0') expected --> 58


In [6]:
def get_batch(data, batch_size, context_size):
    """Sample a random batch of (input, target) windows from a 1-D token tensor.

    Args:
        data: 1-D tensor of token ids to sample windows from.
        batch_size: number of windows to draw (start positions are drawn
            independently, so windows may overlap or repeat).
        context_size: length of every window.

    Returns:
        Tuple ``(inputs, labels)``, each of shape (batch_size, context_size).
        ``labels`` is ``inputs`` shifted one position to the right, i.e.
        ``labels[b, t]`` is the token that follows ``inputs[b, t]`` in ``data``.

    Raises:
        ValueError: if ``data`` holds fewer than ``context_size + 1`` tokens,
            so that no window has a following target token.
    """
    n_starts = len(data) - context_size
    if n_starts <= 0:
        raise ValueError(
            f"data has {len(data)} tokens but at least {context_size + 1} are "
            f"needed for context_size={context_size}"
        )

    # Start offset of every window; the exclusive upper bound keeps start + context_size
    # inside `data`, which is the last index the labels read.
    starts = torch.randint(0, n_starts, (batch_size,), device=data.device)

    # Build all window indices at once and gather them in a single indexing op,
    # instead of slicing and stacking one window at a time in a Python loop
    # (which costs one device sync per sample when the indices live on the GPU).
    offsets = torch.arange(context_size, device=data.device)
    window_ix = starts.unsqueeze(1) + offsets  # (batch_size, context_size)

    inputs = data[window_ix]
    labels = data[window_ix + 1]

    return (inputs, labels)


batch_size, context_size = 2, 8
xb, yb = get_batch(data=train_data, batch_size=batch_size, context_size=context_size)
print(xb)  # batch_size X context_size
print(yb)  # batch_size X context_size

# Walk every (context, target) pair packed into the batch.
for sample_inputs, sample_labels in zip(xb, yb):  # Batch dimension (B)
    for t in range(context_size):  # Time dimension (T)
        context = sample_inputs[:t + 1]
        # The labels are already shifted by one inside get_batch, so index t lines up.
        output = sample_labels[t]
        print(f"Context: {context} expected output --> {output}")

tensor([[ 1, 51, 54, 61, 44,  1, 52, 44],
        [59, 47, 48, 58,  1, 41, 54, 43]], device='cuda:0')
tensor([[51, 54, 61, 44,  1, 52, 44,  6],
        [47, 48, 58,  1, 41, 54, 43, 64]], device='cuda:0')
Context: tensor([1], device='cuda:0') expected output --> 51
Context: tensor([ 1, 51], device='cuda:0') expected output --> 54
Context: tensor([ 1, 51, 54], device='cuda:0') expected output --> 61
Context: tensor([ 1, 51, 54, 61], device='cuda:0') expected output --> 44
Context: tensor([ 1, 51, 54, 61, 44], device='cuda:0') expected output --> 1
Context: tensor([ 1, 51, 54, 61, 44,  1], device='cuda:0') expected output --> 52
Context: tensor([ 1, 51, 54, 61, 44,  1, 52], device='cuda:0') expected output --> 44
Context: tensor([ 1, 51, 54, 61, 44,  1, 52, 44], device='cuda:0') expected output --> 6
Context: tensor([59], device='cuda:0') expected output --> 47
Context: tensor([59, 47], device='cuda:0') expected output --> 48
Context: tensor([59, 47, 48], device='cuda:0') expected output 

In [29]:
class BigramLanguageModel(nn.Module):
    """Character-level language model: a token embedding followed by a linear head.

    The logits at each position are computed from that position's token alone
    (embedding lookup, then a linear projection); positions do not interact.

    Args:
        vocab_size: number of distinct tokens; also the size of the logits' last axis.
        n_embd: width of the embedding vector stored for each token.
    """

    def __init__(self, vocab_size, n_embd):
        super().__init__()

        self.vocab_size = vocab_size

        # (vocab_size, n_embd) lookup table: one learned n_embd-wide vector per token.
        # Indexing it with a (B, T) id tensor yields a (B, T, n_embd) tensor.
        self.token_emb_table = nn.Embedding(vocab_size, n_embd)

        # Projects every embedding to one unnormalised score per vocabulary entry.
        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size)

    def forward(self, inputs, labels=None):
        """Compute next-token logits and, when labels are given, the training loss.

        Args:
            inputs: integer token ids of shape (B, T).
            labels: optional integer token ids with B*T elements (typically (B, T)),
                the expected next token for every input position.

        Returns:
            Without ``labels``: logits of shape (B, T, vocab_size).
            With ``labels``: a tuple ``(logits, loss)`` where ``logits`` has been
            flattened to (B*T, vocab_size) and ``loss`` is the scalar cross-entropy.
        """
        embedded = self.token_emb_table(inputs)  # (B, T, n_embd)
        logits = self.lm_head(embedded)  # (B, T, vocab_size)

        if labels is None:
            return logits

        B, T, C = logits.shape  # batch, time, channels (= vocab_size)
        # F.cross_entropy expects the class dimension second, so fold batch and
        # time into one axis: (B*T, C) logits against (B*T,) targets.
        logits = logits.view(B * T, C)
        labels = labels.reshape(-1)  # reshape also accepts non-contiguous labels
        # Cross-entropy is the mean negative log of the softmax probability that
        # the logits assign to the correct label (not an argmax comparison).
        loss = F.cross_entropy(logits, labels)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after the given context.

        Args:
            idx: integer token ids of shape (B, T) used as the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): ``idx`` followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits = self(idx)  # (B, T, C)
            # Only the last position predicts the next token: (B, T, C) -> (B, C).
            logits = logits[:, -1, :]

            probs = F.softmax(logits, dim=-1)

            # Draw one token per row from the predicted distribution: (B, 1).
            sample = torch.multinomial(probs, num_samples=1)
            # Append along the time axis so the next step sees the new token.
            idx = torch.cat((idx, sample), dim=1)
        return idx

In [30]:
# Smoke test: push one small batch through a freshly initialised model.
batch_size, context_size = 4, 8
model = BigramLanguageModel(vocab_size, n_embd=32)
x, y = get_batch(train_data, batch_size, context_size)

# Called without labels, the model returns only the logits (shown as the cell output).
model(x)

tensor([[[ 0.4075,  0.4943,  0.1116,  ...,  0.9779,  0.1245, -0.2758],
         [ 0.2015,  0.4193, -0.0295,  ..., -0.3305, -0.4139,  0.7940],
         [-0.8755, -1.0539,  0.3203,  ..., -0.5948,  0.4346, -1.4882],
         ...,
         [ 0.3956,  0.9902, -1.0341,  ..., -1.2184,  0.5493,  0.7162],
         [-0.3000, -0.6724,  0.4750,  ...,  0.7379,  0.3419,  0.9488],
         [ 0.2564, -0.2031, -0.3306,  ...,  0.2243,  0.7053, -0.0562]],

        [[-0.3338,  0.4826, -0.5383,  ..., -1.9343,  0.4001,  0.5283],
         [ 0.3143, -0.2986, -0.3064,  ...,  0.1558, -0.0080,  0.1972],
         [ 0.2015,  0.4193, -0.0295,  ..., -0.3305, -0.4139,  0.7940],
         ...,
         [-0.2499,  0.6423,  0.2504,  ..., -0.0449, -0.1230,  0.8345],
         [-0.2499,  0.6423,  0.2504,  ..., -0.0449, -0.1230,  0.8345],
         [ 0.4075,  0.4943,  0.1116,  ...,  0.9779,  0.1245, -0.2758]],

        [[ 0.3143, -0.2986, -0.3064,  ...,  0.1558, -0.0080,  0.1972],
         [ 0.2015,  0.4193, -0.0295,  ..., -0

In [31]:
xb, yb = get_batch(data=train_data, batch_size=batch_size, context_size=context_size)
logits, loss = model(xb, yb)

# torch.zeros defaults to a float tensor while the literal below is an integer tensor;
# the embedding lookup needs integer ids, hence dtype=torch.long for the start token.
print(torch.zeros((1, 1)))
print(torch.tensor([[0]]))

# Sample 500 tokens starting from the single token id 0. No optimizer step has
# run on the model at this point in the notebook.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sample = model.generate(start_tokens, max_new_tokens=500)
generated_ids = sample[0].tolist()
print(decode(generated_ids))


tensor([[0.]], device='cuda:0')
tensor([[0]], device='cuda:0')

f;LOD:.!PWNrBs
$y/HSc;:AhOORP;yOoTVRIs'uIsDm-3:$He'I/:YorZ-uY.bbUan;M,O&CfIiNZa/b
ZyVFF,hrz
kO&kYDYUoo$N/J3Ruz3sj:HIJN:RmRH:DbMAYMBxy/FG/tyBPX
Xzpnv&W;LVeXREczd,SD:kFULnBKP;
v-,V-Vtv
 vlaDwbtkmsWbH-UKzI?-A!pBBBXclPGwaHxPXcozg.KbqcD3yFgnn$/DlCdJ$E
b
Wn:PnrjvPYHv'BIH EP/SORFZG$sM'OdlKvZjuzoMu-pSByIc:P$jnYvUZ,IgiC/$Ri
dKFwfDUkPYzikUfDxtnBL3/oavOKFSXV-KD/WZE
kjctTdWUNn!cBt!EuR:rOKD:-lweEVQHfFjZxvGOqY3Sie xqKjLqlM,:PCCxZjiSDdJUtVRDdPnBiMIVm!M'?Txwc:CTjzME mw;nyMPeHb-jikeRyelmoJpyvT!-QYr;c
HAfDshkdq.p


In [32]:
from tqdm import tqdm

In [33]:

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# NOTE: each "epoch" here is one optimisation step on one random batch,
# not a full pass over train_data.
epochs = 10000
batch_size = 32
context_size = 8

# Training mode does not change between steps, so switch it on once instead of
# on every iteration.
model.train()
for epoch in tqdm(range(epochs)):
    xb, yb = get_batch(data=train_data, batch_size=batch_size, context_size=context_size)
    logits, loss = model(xb, yb)
    # Clear the gradients of the previous step before accumulating new ones.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

model.eval()
# This is the loss of the final training batch only, so it is a noisy estimate.
print(loss)


100%|██████████| 10000/10000 [01:03<00:00, 156.97it/s]

tensor(2.3898, device='cuda:0', grad_fn=<NllLossBackward0>)





In [None]:
# Sample 50 new tokens from the trained model, starting from the single token id 0.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sample = model.generate(start_tokens, max_new_tokens=50)
print(decode(sample[0].tolist()))


To tamy t out g'sshang tais gabener tsen ast be pe
