# Karpathy Guide to Build GPT from Scratch

The full tutorial is available here: https://www.youtube.com/watch?v=kCc8FmEb1nY

In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset.
# The shell command below fetches the raw text file and saves it as input.txt in the working directory.
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-11-06 22:45:53--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8002::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-11-06 22:45:53 (2.61 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# Read the whole Shakespeare corpus into memory as a single string.
# The encoding is pinned to UTF-8 so the decoded text does not depend on the
# platform's default codec.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Report how many characters the corpus contains.
num_chars = len(text)
print(f'Length of characters in the text {num_chars}')

Length of characters in the text 1115394


In [4]:
# Peek at the first 100 characters of the corpus.
preview = text[:100]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
# Character-level vocabulary: every distinct character that occurs in the text,
# in sorted order. Its length is the vocabulary size.
characters = sorted(set(text))
vocab_size = len(characters)
print(''.join(characters), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# Lookup tables between a character and its integer id (its position in `characters`).
stoi = {ch: i for i, ch in enumerate(characters)}
itos = dict(enumerate(characters))


def encode(s):
    """Encoder: take a string, output a list of integers (one id per character).

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output the string they spell.

    Raises KeyError if `l` contains an id that is not in `itos`.
    """
    return ''.join(itos[i] for i in l)

In [7]:
# Try the tokenizer in both directions: text -> ids, then ids -> text.
sample_ids = encode("hii there")
print(sample_ids)
sample_text = decode([23,3,4,2,3,1,23,36,23,1,54,54,8])
print(sample_text)

[46, 47, 47, 1, 58, 46, 43, 56, 43]
K$&!$ KXK pp.


## Tokenize the entire Shakespeare text with character-level tokenization

In [8]:
import torch
# Encode the full corpus and hold it as a tensor of token ids, then show its
# shape, dtype and the first 100 ids.
token_ids = encode(text)
data = torch.tensor(token_ids)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
# Train on the first 90% of the tokens and hold out the remainder for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [10]:
# block_size is the length of the "context" window that the model sees when training.
# Display the first block_size+1 tokens of the training data.
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
# Walk through a single chunk: every prefix of x is a context, and the matching
# entry of y (x shifted by one position) is the token to predict.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


With a block size of 8, a single chunk of 9 tokens yields 8 training examples for the transformer: one for every context length from 1 up to block_size. We train on all of these lengths so that the model sees contexts of varying size during training.

Next we batch these chunks to create our training batches. Because the GPU is very good at parallel processing, we can feed it several chunks at once in a single batch and train on them together. The chunks in a batch are completely independent of each other: they are simply processed at the same time and do not affect each other in any way.

This is where batch size comes in: the batch size is the number of chunks we process at once.

In [12]:
# Batch hyperparameters
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for predictions
torch.manual_seed(1337)  # fixed seed for the random number generator

def get_batch(split):
    """Sample a small random batch of inputs x and targets y.

    `split` selects the source: "train" reads from train_data, any other value
    reads from val_data. Both returned tensors are stacked from batch_size
    slices of length block_size; each row of y is the matching row of x
    shifted one position to the right.
    """
    source = train_data if split == "train" else val_data
    # random start offsets, chosen so a full block plus its shifted target fits
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for i in starts:
        inputs.append(source[i:i+block_size])
        targets.append(source[i+1:i+1+block_size])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the input and target tensors.
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('----')

# Spell out every (context, target) pair contained in the batch.
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

In [13]:
# Show the input batch again; it is what gets passed to the model below.
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


## Creating a simple Bigram Language Model (check out Makemore series)

In [27]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed the random number generator before the model is constructed below.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone."""

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        idx and targets are both (B,T) tensors of integers. With targets, the
        logits are returned flattened to (B*T, C) together with the
        cross-entropy loss. Without targets, the logits keep their (B,T,C)
        shape and the loss is None.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            # nothing to score against (e.g. during generation)
            return logits, None

        # fold batch and time together so logits are (B*T, C) and targets are (B*T,)
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(-1)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend idx, a (B,T) tensor of token ids, with max_new_tokens sampled tokens.

        Returns a (B, T+max_new_tokens) tensor whose first T columns are idx.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # only the last time step is needed to predict the next token
            logits = logits[:, -1, :] # (B,C)
            probs = F.softmax(logits, dim=-1) # (B,C)
            # sample one next token per sequence
            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
            idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
        return idx

# Build the model and push one batch through it; show the logits shape and the loss.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [20]:
# `out` was not defined by any cell in this notebook, so this cell failed on a
# fresh kernel. Define it here as the un-flattened (B,T,C) logits read from the
# model's lookup table.
# NOTE(review): presumably this matches what `out` originally held - confirm.
out = m.token_embedding_table(xb)
# index of the largest logit for batch row 1, time step 1
torch.argmax(out[1,1,:])

tensor(47)

In [22]:
# Largest value in out[1,1].
# NOTE(review): assumes `out` holds (B,T,C) logits from an earlier cell - confirm.
torch.max(out[1,1])

tensor(3.1170, grad_fn=<MaxBackward1>)

In [23]:
# Display the full vector at out[1,1].
# NOTE(review): assumes `out` holds (B,T,C) logits from an earlier cell - confirm.
out[1,1]

tensor([-0.1324, -0.5489,  0.1024, -0.6916,  0.3507,  1.6147,  1.8203,  0.5122,
         1.5810, -2.0063, -1.2925,  0.1268,  1.1099, -0.6592,  0.8084,  1.9072,
        -0.3260, -0.3438, -1.4415, -0.1828, -0.8804, -0.6192, -1.4047, -0.8584,
        -0.3830, -0.5372, -1.2176, -1.9403, -0.3094,  0.1790,  1.2859,  0.3039,
         1.8110,  0.6350, -0.0820, -2.1208,  1.2516, -0.6826,  0.3838,  0.0150,
        -0.2801,  1.4896, -0.4646, -1.9210, -0.1062,  1.0614,  0.9308,  3.1170,
        -1.5428, -2.2848,  0.5755, -0.8040,  0.8010,  0.0088, -0.4751, -0.9630,
        -0.5078,  0.1018,  1.9141, -1.9252, -1.5554, -0.1878, -0.8599, -1.6050,
        -0.6985], grad_fn=<SelectBackward0>)