In [1]:
"""
Prepare the Shakespeare dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin containing the ids, and meta.pkl containing the
encoder and decoder and some other related info.
"""
import os
import numpy as np
from pathlib import Path
import torch
# Directory this code lives in: the script's folder when run as a file, or the
# current working directory inside a notebook (where __file__ is not defined).
base_dir = Path(__file__).resolve().parent if '__file__' in globals() else Path.cwd()
print(base_dir)

# Read the corpus from base_dir so the lookup does not depend on the directory
# the process happened to be launched from.
with open(base_dir / 'input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print("length of dataset in characters: ", len(text))

# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print("Vocab size is ", vocab_size)
print(''.join(chars))

# Lookup tables: character -> integer id, and integer id -> character.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids ``l``.

    Raises:
        KeyError: if ``l`` contains an id that is not in ``itos``.
    """
    return ''.join(itos[i] for i in l)


# Round-trip sanity check of the two mappings.
print(encode("hello world"))
print(decode(encode("hello world")))

# Encode the whole corpus once into a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:10])
# length of dataset in characters:  1115394
# all the unique characters:
#  !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
# vocab size: 65
# train has 1003854 tokens
# val has 111540 tokens


/home/ruochen/projects/nanoGPT/data/shakespeare_char
length of dataset in characters:  1115394
Vocab size is  65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


In [2]:
# Hold out the final 10% of the encoded corpus for validation; the first 90%
# is used for training.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]
print(train_data[:10], val_data[:10], sep="\n")

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])
tensor([12,  0,  0, 19, 30, 17, 25, 21, 27, 10])


In [3]:
# block_size is the maximum context length (in tokens) the model gets to see.
block_size = 8
# One window of block_size + 1 tokens: block_size inputs plus the final target.
train_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [4]:
# x is the first block_size tokens; y is the same window shifted one position
# to the right, so y[t] is the token that follows the prefix x[:t+1].
x = train_data[0:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    # Each prefix of the window, paired with its next token, is one example.
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target is {target}")

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [None]:
# Batch geometry: number of sequences per batch, and tokens per sequence.
batch_size = 4
block_size = 8
# Fix the global RNG seed so the randomly sampled batches are reproducible.
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: 'train' draws from ``train_data``; any other value draws from
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors, each of shape ``(batch_size, block_size)``.
        Row ``k`` of ``y`` is row ``k`` of ``x`` shifted one token to the right.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence, e.g. [76049, 234249, 934904, 560986].
    # The exclusive upper bound keeps the shifted target window inside `source`.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs, targets

# Draw one training batch and show the shape and contents of inputs and targets.
xb, yb = get_batch('train')
print(xb.shape, xb, yb.shape, yb, sep="\n")



torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed the global RNG so the model created later in this cell starts from reproducible weights.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The whole model is a single learnable ``(vocab_size, vocab_size)`` embedding
    table. Row ``i`` of the table holds the unnormalised scores (logits) for
    every token that may follow token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: Number of distinct tokens. It is used both as the number
                of rows in the table and as the width of each row.
        """
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        # Loss of the most recent forward call that was given targets, else None.
        self.last_loss = None

    def forward(self, idx, targets=None):
        """Look up next-token logits and, when targets are given, the loss.

        Every token id in ``idx`` selects its row of the embedding table, so
        each position is handled independently of the others. For an input of
        shape ``(4, 8)`` and a vocabulary of 65 tokens the result has shape
        ``(4, 8, 65)``: 65 next-token scores for each of the 32 positions.

        Args:
            idx: Integer tensor of token ids, e.g. of shape ``(batch, time)``.
            targets: Optional integer tensor of next-token ids with the same
                shape as ``idx``.

        Returns:
            The logits tensor, ``idx.shape + (vocab_size,)``. The return value
            is the logits alone so that callers reading ``out.shape`` keep
            working; the loss is exposed through ``self.last_loss``.
        """
        logits = self.token_embedding_table(idx)  # [batch_size, block_size, vocab_size]

        if targets is None:
            # Nothing to score against; clear any loss left from an earlier call.
            self.last_loss = None
        else:
            # Negative log-likelihood (cross-entropy) loss. F.cross_entropy wants
            # the class dimension second, so flatten all positions into one axis:
            # logits -> (N, vocab_size), targets -> (N,).
            num_classes = logits.size(-1)
            self.last_loss = F.cross_entropy(
                logits.reshape(-1, num_classes), targets.reshape(-1)
            )

        return logits
    
# Build the model for this vocabulary and push the sample batch through it.
m = BigramLanguageModel(vocab_size=vocab_size)
out = m(xb, targets=yb)
print(out.size())

torch.Size([4, 8, 65])
