In [1]:
from bpe import BasicTokenizer

tokenizer = BasicTokenizer()
tokenizer.load(model_file='./output/tokenizer/tokenzier_v1.model')

def get_vocab_size(tokenizer_param: BasicTokenizer):
    return len(tokenizer_param.vocab) + len(tokenizer_param.special_tokens)

In [2]:
import torch
torch.manual_seed(3462)

print(f"PyTorch версия: {torch.__version__}")
print(f"CUDA версия: {torch.version.cuda}")
print(f"CuDNN версия: {torch.backends.cudnn.version()}")

PyTorch версия: 2.8.0+cu126
CUDA версия: 12.6
CuDNN версия: 91002


In [4]:
from transformers import GPTLanguageModel

block_size = 256
n_embedding = 512
n_head = 8
n_layer = 4
dropout = 0.2
batch_size = 64
vocab_size = get_vocab_size(tokenizer)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GPTLanguageModel(
    vocab_size=vocab_size,
    block_size=block_size,
    n_embeddings=n_embedding,
    n_head=n_head,
    device=device,
    n_layers=n_layer,
    dropout=dropout
)
model.to(device)
model = torch.compile(model)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')
device

13.79329 M parameters


'cuda'

In [403]:
checkpoint_path = f'./output/pretrain/v3/checkpoint900.pth'
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [5]:
with open('./output/text_corpus.txt', 'r', encoding='utf-8') as f:
    text_corpus = f.read()

encoded_text = tokenizer.encode(text_corpus)
len(encoded_text)

255729

In [6]:
data = torch.tensor(encoded_text, dtype=torch.long)
split_index = int(0.9*len(data))
train_data = data[:split_index]
valid_data = data[split_index:]

In [7]:
from typing import Tuple

def get_batch(split: str) -> Tuple[torch.tensor, torch.tensor]:
    date = train_data if split == 'train' else valid_data
    index = torch.randint(len(data) - block_size, (batch_size,) )
    x = torch.stack([data[i:i+block_size] for i in index])
    y = torch.stack([data[i+1:i+1+block_size] for i in index])
    x,y = x.to(device), y.to(device)
    return x,y

In [8]:
x,y = get_batch('train')
x.shape, y.shape

(torch.Size([64, 256]), torch.Size([64, 256]))

In [15]:
from typing import Dict

eval_iters = 200

@torch.no_grad()
def estimate_loss() -> Dict:
    output = {}
    model.eval()
    for split in ['train', 'valid']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x,y = get_batch(split)
            _, loss = model(x,y)
            losses[k] = loss
        output[split] = losses
    model.train()
    return output

In [12]:
def save_checkpoint(model: torch.nn.Module,
                    optimizer: torch.optim.Optimizer,
                    epoch: int,
                    loss: float,
                    file_path: str = 'checkpoint.pth'
                    ) -> None:
    checkpoint = {'epoch': epoch,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'loss': loss
                  }
    torch.save(checkpoint, file_path)

In [19]:
from tqdm import tqdm

max_iters = 1000
eval_intervals = 10
learning_rate = 1e-4
save_intervals = 100
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_losses = []
valid_losses = []

for iteration in tqdm(range(max_iters)):
    if iteration % eval_intervals == 0 or iteration == max_iters - 1:
        losses = estimate_loss()
        print(f'step {iteration}'
              f'train loss: {losses['train']:.4f}'
              f'valid loss: {losses["valid"]:.4f}')
        train_losses.append(losses['train'])
        valid_losses.append(losses["valid"])

    x_batch, y_batch = get_batch('train')
    logits, loss = model(x_batch, y_batch)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if iteration % save_intervals == 0:
        save_checkpoint(model, optimizer, iteration, loss, f'./output/pretrain/v3/checkpoint{iteration}.pth')





  0%|          | 0/1000 [00:09<?, ?it/s]


TritonMissing: Cannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at: https://github.com/triton-lang/triton

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


<All keys matched successfully>

In [10]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(train_losses, label="Train Loss", marker='o')
plt.plot(valid_losses, label="Validation Loss", marker='o')
plt.xlabel("Evaluation Step")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Over Time")
plt.legend()
plt.grid()
plt.show()

NameError: name 'train_losses' is not defined

<Figure size 1000x500 with 0 Axes>

In [405]:
input_tokens = tokenizer.encode('Что нового?')
input_tokens = torch.tensor(input_tokens, dtype=torch.long).unsqueeze(0).to(device)
model.eval()
with torch.no_grad():
    output = model.generate(input_tokens, 50)
a = output[0]
print(tokenizer.decode(a.tolist()))

Что нового?\nМы ростелекома)ОбъявлосьУважаемые соседи! А что нужно с мобильным мужелини из 3 секциегоа не трети и в другой день! А� гу
