In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
import hashlib
import os
# Load the entire training corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [2]:
# Sanity check: report how many characters were read from input.txt.
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [3]:
# Preview the first 10 characters of the corpus.
print(text[:10])

First Citi


In [4]:
# Point tiktoken at a local cache directory so the GPT-2 encoding can be loaded
# from disk. The check below expects the cached vocab file to be named after
# the SHA-1 hex digest of its source URL.
blobpath = "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe"
cache_key = hashlib.sha1(blobpath.encode()).hexdigest()
tiktoken_cache_dir = "./tiktoken_cache"
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir

# Fail early with an actionable message instead of a bare assert, which gives
# no context and is stripped entirely when Python runs with -O.
cached_vocab_path = os.path.join(tiktoken_cache_dir, cache_key)
if not os.path.exists(cached_vocab_path):
    raise FileNotFoundError(
        f"Expected a cached copy of {blobpath} at {cached_vocab_path!r}; "
        "populate the tiktoken cache directory before running this cell."
    )

# Now you can use tiktoken
enc = tiktoken.get_encoding("gpt2")
print(f"The size of the vocabulary is {enc.n_vocab}")

The size of the vocabulary is 50257


In [5]:
# Text -> list of token ids, using the tokenizer loaded above.
encode = enc.encode


def decode(input_data):
    """Turn token ids back into text.

    Args:
        input_data: Token ids, either as a ``torch.Tensor`` (converted with
            ``.tolist()`` first) or as anything ``enc.decode`` accepts directly.

    Returns:
        The string produced by ``enc.decode``.
    """
    if isinstance(input_data, torch.Tensor):
        # enc.decode is handed plain Python values, so unwrap the tensor.
        # (The debug print of the full id list that used to live here was
        # removed: it spammed the output on every tensor decode.)
        input_data = input_data.tolist()
    return enc.decode(input_data)


# Round-trip check: encoding then decoding should give the original text back.
print(encode("hii there"))
print(decode(encode("hii there")))

[71, 4178, 612]
hii there


In [6]:
# Tokenize the whole corpus once and store the ids as an int64 (torch.long) tensor.
data = torch.tensor(encode(text), dtype=torch.long)
# Show the tensor's shape and dtype, then the first 10 token ids.
print(data.shape, data.dtype)
print(data[:10]) 

torch.Size([338025]) torch.int64
tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11])


In [7]:
class SequenceDataset(Dataset):
    """Sliding-window next-token dataset.

    Item ``i`` is the pair ``(x, y)`` where ``x = data[i : i + block_size]`` and
    ``y`` is the same window shifted one position to the right, so ``y[t]`` is
    the token that follows ``x[t]``.

    Args:
        data: Sequence supporting ``len()`` and slicing (a 1-D tensor of token
            ids in this notebook).
        block_size: Number of tokens per window; must be at least 1.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """

    def __init__(self, data, block_size):
        if block_size < 1:
            raise ValueError(f"block_size must be >= 1, got {block_size}")
        self.data = data
        self.block_size = block_size

    def __len__(self):
        """Number of complete (x, y) windows; 0 when the data is too short."""
        # Each window needs block_size inputs plus one extra token for the last
        # target. Clamp at 0 so len() never receives a negative value.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair for window ``idx``.

        Negative indices count from the end. Raises ``IndexError`` when ``idx``
        is out of range; without this, slicing past the end silently returns
        short tensors and plain ``for x, y in dataset`` never terminates.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(
                f"index out of range for SequenceDataset with {n_windows} windows"
            )
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

# Batching hyperparameters.
block_size = 8    # tokens per training window
batch_size = 64   # windows per batch

# Hold out the last 10% of the token stream for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# One sliding-window dataset per split.
train_dataset = SequenceDataset(data=train_data, block_size=block_size)
val_dataset = SequenceDataset(data=val_data, block_size=block_size)

# Only the training loader is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

# Pull a single training batch and show it; xb and yb stay bound for later cells.
for xb, yb in train_loader:
    print('inputs:', xb.shape, xb)
    print('targets:', yb.shape, yb)
    break

inputs: torch.Size([64, 8]) tensor([[ 1225,   713,   290, 32363,   257,   582,   338,    30],
        [17903,   422, 11906,  6002,  1272,    11,   198,    39],
        [  534, 33558,   284,   262,   640,    13,   198,  7120],
        [ 2538, 35830,  1546,    25,   198,   198,    44,  2390],
        [  300,  6315,    25,   198,    44,  6532,    11,   467],
        [ 3963, 14545,  4944,    51,    25,   198,  2514,   307],
        [  502,    11,   290,   777, 11906,  9730,    11,   198],
        [ 2767,    25,   198,    46,  1793,     0,   750, 43989],
        [19337,    11,  5609,  2415,     0,   198,  2437,   783],
        [ 2937,    25,   198,    40,  1833, 17903,   880,    26],
        [   13,  1649,    11,   327,  1872,   385,    11, 10598],
        [20889,   329,    26, 26509, 11906,  3956,   198,  3886],
        [  198,   464, 12296,   286,   534, 34685,    26, 10598],
        [  262, 17435,    26,   356,   389,  2677,  8616,   338],
        [ 2390,  8267,    40,  2937,    25,   19

In [8]:
# Walk through the first sequence of the sample batch: each prefix of the
# inputs is a context, and its target is the token at the same position in yb.
sample_inputs, sample_targets = xb[0], yb[0]
for t in range(block_size):
    context = sample_inputs[:t + 1].tolist()   # prefix as a plain list
    target = [sample_targets[t].item()]        # single-element list
    print(f"when input is:\n {context} the target:\n {target}")

when input is:
 [1225] the target:
 [713]
when input is:
 [1225, 713] the target:
 [290]
when input is:
 [1225, 713, 290] the target:
 [32363]
when input is:
 [1225, 713, 290, 32363] the target:
 [257]
when input is:
 [1225, 713, 290, 32363, 257] the target:
 [582]
when input is:
 [1225, 713, 290, 32363, 257, 582] the target:
 [338]
when input is:
 [1225, 713, 290, 32363, 257, 582, 338] the target:
 [30]
when input is:
 [1225, 713, 290, 32363, 257, 582, 338, 30] the target:
 [198]


In [9]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    Args:
        vocab_size: Number of distinct token ids.
        n_embd: Optional width of a low-rank factorisation. With the default
            ``None`` the model is a dense ``vocab_size x vocab_size`` table in
            which row ``i`` holds the logits for the token following ``i``.
            That table has ``vocab_size ** 2`` float32 entries (about 9.4 GiB
            for the 50257-token GPT-2 vocabulary). Passing an integer instead
            embeds tokens into ``n_embd`` dimensions and projects back to
            ``vocab_size`` logits with a linear layer, which needs roughly
            ``2 * vocab_size * n_embd`` parameters.
    """

    def __init__(self, vocab_size, n_embd=None):
        super().__init__()
        if n_embd is None:
            # Dense lookup table: the embedding of a token *is* its logit row.
            self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
            self.lm_head = None
        else:
            self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
            self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx):
        """Compute next-token logits for every position.

        Args:
            idx: Integer tensor of token ids with shape (B, T).

        Returns:
            Raw (un-normalised) logits flattened to shape (B*T, vocab_size),
            the layout ``nn.CrossEntropyLoss`` expects alongside flattened
            (B*T,) targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C) or (B, T, n_embd)
        if self.lm_head is not None:
            logits = self.lm_head(logits)         # (B, T, C)
        B, T, C = logits.shape
        return logits.reshape(B * T, C)

    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens and append them to ``idx``.

        Args:
            idx: Integer tensor of shape (B, T) with T >= 1 holding the prompt.
            max_new_tokens: Number of tokens to sample per sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        with torch.no_grad():  # sampling never needs gradients
            for _ in range(max_new_tokens):
                # Condition on the most recent token only. Feeding the whole
                # (B, T) prompt would yield B*T rows of logits and B*T samples,
                # which cannot be concatenated back onto a (B, T) tensor.
                logits = self(idx[:, -1:])                                  # (B, C)
                probabilities = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probabilities, num_samples=1)  # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)                     # (B, T+1)
        return idx

In [10]:
# Choose the compute device; nothing is moved onto it in this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Untrained bigram model over the tokenizer's vocabulary, with its loss and optimizer.
model = BigramLanguageModel(vocab_size=enc.n_vocab)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

# Forward pass on the sample batch; the first dimension of the logits is B*T.
logits = model(xb)
BT, _ = logits.shape

In [11]:
# Flatten the targets to one id per logit row, then score the untrained model.
targets = yb.view(BT)
loss = criterion(logits, targets)

# Sample 10 tokens from the untrained model, starting from token id 0.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sampled = model.generate(idx=start_tokens, max_new_tokens=10)
print(decode(sampled[0].tolist()))

! energeticLAB pressures Harlem loyaltyλensive hysteria population explosives


In [12]:
# NOTE: a dense vocab_size x vocab_size float32 table for the 50257-token
# vocabulary is 50257**2 * 4 bytes, about 9.41 GiB. That is the allocation the
# recorded OutOfMemoryError reports, so this move only succeeds on a device
# with enough free memory for the model.
model.to(device)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)  # Move the batch to the model's device

        # Forward pass. The model returns logits flattened to (B*T, C), so the
        # (B, T) targets must be flattened to (B*T,) as well; passing yb
        # unflattened makes CrossEntropyLoss fail on a batch-size mismatch.
        logits = model(xb)
        loss = criterion(logits, yb.reshape(-1))

        # Backward pass and optimization
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {average_loss:.4f}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 9.41 GiB (GPU 0; 5.77 GiB total capacity; 0 bytes already allocated; 5.05 GiB free; 0 bytes reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF