In [3]:
# Configure the root logger once for the whole notebook: INFO and above,
# each record prefixed with a "MM/DD/YYYY HH:MM:SS" timestamp, level and logger name.
import logging

log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
date_format = "%m/%d/%Y %H:%M:%S"
logging.basicConfig(level=logging.INFO, format=log_format, datefmt=date_format)

In [4]:
from mingpt.utils import set_seed

# Seed before any model or dataset object is created so reruns start from the same state.
# NOTE(review): which RNGs set_seed covers is defined in mingpt.utils — confirm there.
SEED = 42
set_seed(SEED)

In [5]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [16]:
%%writefile dataset.py  
# Jupyter cell magic: the code below is written to dataset.py so it can be imported later

import math
import torch
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """Character-level dataset that serves next-character prediction examples.

    Each item is a pair ``(x, y)`` of ``torch.long`` tensors, both of length
    ``block_size``; ``y`` is ``x`` shifted one character to the right, so
    ``y[t]`` is the character that follows ``x[t]`` in the text.

    Attributes set by ``__init__``:
        stoi: dict mapping each distinct character of ``data`` to an integer index.
        itos: dict mapping each integer index back to its character.
        block_size: number of characters in every input sequence.
        vocab_size: number of distinct characters in ``data``.
        data: the raw text the examples are sliced from.
    """

    def __init__(self, data, block_size):
        """Build the character vocabulary from ``data`` and store the sequence length.

        Args:
            data: the text (a sequence of characters) to draw examples from.
            block_size: number of characters per input sequence.
        """
        # Sorting makes the character -> index assignment deterministic across runs.
        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print(f"data has {data_size} characters, {vocab_size} unique.")

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        """Return the number of start positions that yield a full example."""
        # Why subtract block_size? Every example needs block_size + 1 characters
        # (inputs plus one extra target), so the last block_size positions
        # cannot start an example.
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        """Return the ``(x, y)`` tensor pair for the example starting at ``idx``.

        Raises:
            KeyError: if the chunk contains a character missing from ``stoi``.
        """
        # Take one extra character so the final input position has a target.
        chunk = self.data[idx : idx + self.block_size + 1]
        # Encode every character as its integer vocabulary index.
        dix = [self.stoi[s] for s in chunk]

        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y


Overwriting dataset.py


In [17]:
# Context length: passed to CharDataset below, where every training input is block_size characters long.
block_size = 128

In [None]:
# Download the tiny-shakespeare corpus to input.txt.
# -f: exit non-zero on an HTTP error instead of saving the server's error page as the corpus.
# -L: follow redirects.
!curl -fL https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o input.txt

In [21]:
from dataset import CharDataset

# Read the whole corpus into memory. The context manager closes the file handle
# promptly, and the explicit encoding keeps decoding independent of the
# platform's default locale.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Wrap the raw text as (input, target) character windows of length block_size.
train_dataset = CharDataset(text, block_size)

data has 1115394 characters, 65 unique.


In [23]:
from mingpt.model import GPT, GPTConfig

# Architecture hyperparameters, kept together so they are easy to tune in one place.
model_kwargs = dict(n_layer=8, n_head=8, n_embd=512)

# Vocabulary size and context length come from the dataset so the model always matches the data.
mconf = GPTConfig(
    train_dataset.vocab_size,
    train_dataset.block_size,
    **model_kwargs,
)
model = GPT(mconf)

12/05/2025 15:37:33 - INFO - mingpt.model - number of parameters: 2.535219e+07


In [24]:
from mingpt.trainer import Trainer, TrainerConfig

# Token budgets handed to the trainer's learning-rate schedule (lr_decay=True).
# NOTE(review): 512 and 2 look like batch_size and max_epochs repeated — confirm before linking them.
warmup_tokens = 512 * 20
final_tokens = 2 * len(train_dataset) * block_size

tconf = TrainerConfig(
    max_epochs=2,
    batch_size=512,
    learning_rate=6e-4,
    lr_decay=True,
    warmup_tokens=warmup_tokens,
    final_tokens=final_tokens,
    num_workers=0,
)

# The third positional argument is None.
# NOTE(review): presumably the evaluation/test dataset slot — confirm against Trainer's signature.
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

  0%|          | 0/2179 [00:20<?, ?it/s]


KeyboardInterrupt: 

In [25]:
from mingpt.utils import sample

context = "O God, O God!"

# Encode the prompt with the dataset's vocabulary, add a leading axis of size 1,
# and move the tensor to the device the trainer is using.
prompt_ids = [train_dataset.stoi[ch] for ch in context]
x = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(trainer.device)

# NOTE(review): 2000 is presumably the number of sampling steps — confirm against mingpt.utils.sample.
# [0] selects the first entry along the leading axis of the returned value.
y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]

# Decode the indices back to characters and print the generated text.
completion = ''.join(train_dataset.itos[int(tok)] for tok in y)
print(completion)

KeyboardInterrupt: 