In [3]:
from bpe import BasicTokenizer

# Create an empty tokenizer, then populate it from the saved model file.
tokenizer = BasicTokenizer()
# NOTE(review): the file name is spelled 'tokenzier' — presumably it matches the file on disk; confirm before renaming.
tokenizer.load(model_file='./output/tokenizer/tokenzier_v1.model')

def get_vocab_size(tokenizer_param: BasicTokenizer):
    """Return the combined size of the tokenizer's vocabulary and special tokens.

    Args:
        tokenizer_param: Tokenizer exposing sized ``vocab`` and
            ``special_tokens`` attributes.

    Returns:
        ``len(vocab) + len(special_tokens)``.
    """
    regular_count = len(tokenizer_param.vocab)
    special_count = len(tokenizer_param.special_tokens)
    return regular_count + special_count

In [4]:
import torch
# Seed PyTorch's global random number generator.
SEED = 3462
torch.manual_seed(SEED)

# Report the PyTorch / CUDA / cuDNN versions, one per line, in a single print call.
print(
    f"PyTorch версия: {torch.__version__}",
    f"CUDA версия: {torch.version.cuda}",
    f"CuDNN версия: {torch.backends.cudnn.version()}",
    sep="\n",
)

PyTorch версия: 2.8.0+cu126
CUDA версия: 12.6
CuDNN версия: 91002


In [5]:
from transformers import GPTLanguageModel

# Hyperparameters shared by the dataset, the data loaders and the model.
block_size = 256      # window length used for every training sample
n_embedding = 512
n_head = 8
n_layer = 4
dropout = 0.2
batch_size = 64
vocab_size = get_vocab_size(tokenizer)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Collect the constructor arguments first so the configuration is inspectable.
model_kwargs = dict(
    vocab_size=vocab_size,
    block_size=block_size,
    n_embeddings=n_embedding,
    n_head=n_head,
    device=device,
    n_layers=n_layer,
    dropout=dropout,
)
model = GPTLanguageModel(**model_kwargs)
model.to(device)

# Parameter count, reported in millions.
n_parameters = sum(p.numel() for p in model.parameters())
print(n_parameters/1e6, 'M parameters')
device

13.79329 M parameters


'cuda'

In [6]:
# Read the whole training corpus into memory as UTF-8 text.
corpus_path = './output/text_corpus.txt'
with open(corpus_path, encoding='utf-8') as corpus_file:
    text_corpus = corpus_file.read()

# Tokenize the corpus and show how many tokens it produced.
encoded_text = tokenizer.encode(text_corpus)
len(encoded_text)

255729

In [7]:
# Token ids as a single int64 tensor.
data = torch.tensor(encoded_text, dtype=torch.long)

# First 90% of the tokens go to training, the remaining tail to validation.
split_index = int(0.9*len(data))
train_data, valid_data = data[:split_index], data[split_index:]

In [8]:
from typing import Tuple
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Sliding-window dataset that yields (input, target) pairs for next-token prediction.

    Sample ``i`` is ``(data[i : i + block_size], data[i + 1 : i + block_size + 1])``,
    i.e. the target is the input window shifted one position to the right.
    Slicing is done along the first dimension of ``data``.
    """

    def __init__(self, data: torch.Tensor, block_size: int) -> None:
        """Store the token tensor and the window length.

        Args:
            data: Tensor to cut windows from (intended to be a 1-D tensor of token ids).
            block_size: Length of every input/target window.

        Raises:
            ValueError: If ``data`` is not strictly longer than ``block_size``,
                in which case not even one (input, target) pair exists.
        """
        if len(data) <= block_size:
            raise ValueError(
                f'The length of the dataset ({len(data)}) must be greater than the block size ({block_size})'
            )
        self.data = data
        self.block_size = block_size

    def __len__(self) -> int:
        """Return the number of windows that still have a full shifted target."""
        return len(self.data) - self.block_size

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the ``index``-th (input, target) window pair.

        Negative indices count from the end, as with Python sequences.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
                Without this check an out-of-range index would silently return
                truncated/empty slices, and plain iteration over the dataset
                (which stops only on ``IndexError``) would never terminate.
        """
        size = len(self)
        requested_index = index
        if index < 0:
            # Rebind instead of ``+=`` so a caller-owned tensor index is never mutated.
            index = index + size
        if not 0 <= index < size:
            raise IndexError(
                f'index {requested_index} is out of range for a dataset of {size} samples'
            )
        x = self.data[index: index + self.block_size]
        y = self.data[index + 1: index + self.block_size + 1]
        return x, y

def get_dataloader(train_data: torch.Tensor,
                   valid_data: torch.Tensor,
                   block_size: int,
                   batch_size: int,
                   device: torch.device) -> Tuple[DataLoader, DataLoader]:
    """Build the training and validation data loaders.

    Each split tensor is moved to ``device`` and wrapped in a ``TextDataset``
    with windows of ``block_size`` tokens.

    Args:
        train_data: Tensor holding the training split.
        valid_data: Tensor holding the validation split.
        block_size: Window length passed to ``TextDataset``.
        batch_size: Number of windows per batch for both loaders.
        device: Device the split tensors are moved to before being wrapped.

    Returns:
        ``(train_loader, valid_loader)``; only the training loader shuffles.
    """
    def _make_loader(split: torch.Tensor, shuffle: bool) -> DataLoader:
        # One dataset + loader per split; the splits differ only in shuffling.
        dataset = TextDataset(split.to(device), block_size)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = _make_loader(train_data, True)
    valid_loader = _make_loader(valid_data, False)
    return train_loader, valid_loader

In [9]:
# Build both loaders from the split tensors.
train_loader, valid_loader = get_dataloader(
    train_data=train_data,
    valid_data=valid_data,
    block_size=block_size,
    batch_size=batch_size,
    device=device,
)

# Pull a single batch and display its shapes as a quick sanity check.
first_batch = next(iter(train_loader))
x, y = first_batch
x.shape, y.shape

(torch.Size([64, 256]), torch.Size([64, 256]))

In [10]:
from typing import Dict

@torch.no_grad()
def estimate_loss(
        model: torch.nn.Module,
        train_loader: DataLoader,
        valid_loader: DataLoader,
        eval_iterations: int
) -> Dict[str, float]:
    """Estimate the mean loss on the training and validation loaders.

    The model is switched to eval mode, run on at most ``eval_iterations``
    batches from each loader with gradients disabled (via the decorator), and
    the per-batch losses are averaged.

    Args:
        model: Module whose call ``model(x, y)`` returns a pair whose second
            element is a scalar loss tensor.
        train_loader: Iterable of ``(x, y)`` batches for the ``'train'`` estimate.
        valid_loader: Iterable of ``(x, y)`` batches for the ``'valid'`` estimate.
        eval_iterations: Maximum number of batches evaluated per loader.

    Returns:
        ``{'train': mean_loss, 'valid': mean_loss}``. A split for which no batch
        was evaluated maps to ``nan``.
    """
    # Remember the current mode so an eval-mode model is not flipped to train mode.
    was_training = model.training
    model.eval()
    output = {}
    try:
        for split, loader in (('train', train_loader), ('valid', valid_loader)):
            losses = []
            # zip() stops after ``eval_iterations`` batches without fetching an extra one.
            for _, (x, y) in zip(range(eval_iterations), loader):
                _, loss = model(x, y)
                losses.append(loss.item())
            # Average only the batches actually evaluated: a loader shorter than
            # ``eval_iterations`` must not drag the mean towards zero.
            output[split] = sum(losses) / len(losses) if losses else float('nan')
    finally:
        # Restore training mode even if evaluation raised.
        if was_training:
            model.train()
    return output


In [11]:
def save_checkpoint(model: torch.nn.Module,
                    optimizer: torch.optim.Optimizer,
                    epoch: int,
                    loss: float,
                    file_path: str = 'checkpoint.pth'
                    ) -> None:
    """Serialize the model and optimizer state to ``file_path`` with ``torch.save``.

    Missing parent directories of ``file_path`` are created first, so a long
    training run does not fail at save time because the output folder is absent.

    Args:
        model: Module whose ``state_dict()`` is stored under ``'model_state_dict'``.
        optimizer: Optimizer whose ``state_dict()`` is stored under
            ``'optimizer_state_dict'``.
        epoch: Value stored under ``'epoch'``.
        loss: Value stored under ``'loss'``.
        file_path: Destination path of the checkpoint file.
    """
    import os  # local import keeps this definition self-contained

    # Only path-like destinations have a parent directory to create.
    if isinstance(file_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    checkpoint = {'epoch': epoch,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'loss': loss
                  }
    torch.save(checkpoint, file_path)

In [None]:
from tqdm import tqdm
# Training-run settings.
max_iters = 1          # number of full passes over train_loader
eval_interval = 100    # evaluate every this many training steps
eval_iters = 200       # upper bound on batches used per loss estimate
learning_rate = 1e-4

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_loader, val_loader = get_dataloader(
    train_data=train_data,
    valid_data=valid_data,
    block_size=block_size,
    batch_size=batch_size,
    device=device,
)

# Loss history, one entry per evaluation point.
train_losses = []
valid_losses = []

# Loop-invariant values, computed once.
steps_per_pass = len(train_loader)
last_batch_idx = steps_per_pass - 1
eval_batches = min(eval_iters, len(val_loader))

for iteration in range(max_iters):
    progress = tqdm(enumerate(train_loader), total=steps_per_pass)
    for batch_idx, (x_batch, y_batch) in progress:
        # Evaluate at step 0, every `eval_interval` steps, and on the final batch.
        is_eval_step = batch_idx % eval_interval == 0 or batch_idx == last_batch_idx
        if is_eval_step:
            eval_losses = estimate_loss(model, train_loader, val_loader, eval_batches)
            train_losses.append(eval_losses['train'])
            valid_losses.append(eval_losses['valid'])
            print(f'iteration: {iteration} / step : {batch_idx}, '
                  f'train_losses: {eval_losses["train"]:.4f},'
                  f'valid_losses: {eval_losses["valid"]:.4f}'
                  )

        # One optimization step on the current batch.
        logits, loss = model(x_batch, y_batch)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # Checkpoint once per completed pass, recording the last batch's loss.
    save_checkpoint(model,
                    optimizer,
                    iteration,
                    loss.item(),
                    file_path=f'./output/pretrain/v2/checkpoint_{iteration}.pth')


  0%|          | 0/3593 [00:00<?, ?it/s]

iteration: 0 / step : 0, train_losses: 0.2386,valid_losses: 6.0420


  3%|▎         | 101/3593 [03:26<22:06:29, 22.79s/it]

iteration: 0 / step : 100, train_losses: 0.2213,valid_losses: 6.1100


  6%|▌         | 201/3593 [05:40<21:25:48, 22.74s/it]

iteration: 0 / step : 200, train_losses: 0.2102,valid_losses: 6.2026


  8%|▊         | 301/3593 [07:54<20:48:26, 22.75s/it]

iteration: 0 / step : 300, train_losses: 0.2023,valid_losses: 6.2731


 11%|█         | 401/3593 [10:07<20:08:09, 22.71s/it]

iteration: 0 / step : 400, train_losses: 0.1934,valid_losses: 6.3360


 14%|█▍        | 501/3593 [12:21<19:31:50, 22.74s/it]

iteration: 0 / step : 500, train_losses: 0.1849,valid_losses: 6.4197


 17%|█▋        | 601/3593 [14:38<19:34:38, 23.56s/it]

iteration: 0 / step : 600, train_losses: 0.1811,valid_losses: 6.4578


 20%|█▉        | 701/3593 [16:54<18:33:18, 23.10s/it]

iteration: 0 / step : 700, train_losses: 0.1748,valid_losses: 6.5276


 22%|██▏       | 801/3593 [19:09<18:00:28, 23.22s/it]

iteration: 0 / step : 800, train_losses: 0.1708,valid_losses: 6.5823


 25%|██▌       | 901/3593 [21:23<17:00:23, 22.74s/it]

iteration: 0 / step : 900, train_losses: 0.1647,valid_losses: 6.6314


 28%|██▊       | 1001/3593 [23:38<16:38:00, 23.10s/it]

iteration: 0 / step : 1000, train_losses: 0.1614,valid_losses: 6.7198


 31%|███       | 1101/3593 [25:56<16:25:43, 23.73s/it]

iteration: 0 / step : 1100, train_losses: 0.1575,valid_losses: 6.7489


 33%|███▎      | 1201/3593 [28:13<15:37:40, 23.52s/it]

iteration: 0 / step : 1200, train_losses: 0.1540,valid_losses: 6.7951


 36%|███▌      | 1301/3593 [30:29<14:38:21, 22.99s/it]

iteration: 0 / step : 1300, train_losses: 0.1504,valid_losses: 6.8306


 39%|███▉      | 1401/3593 [32:44<13:58:39, 22.96s/it]

iteration: 0 / step : 1400, train_losses: 0.1476,valid_losses: 6.8949


 42%|████▏     | 1501/3593 [34:59<13:22:32, 23.02s/it]

iteration: 0 / step : 1500, train_losses: 0.1451,valid_losses: 6.9641


 45%|████▍     | 1601/3593 [37:14<12:42:33, 22.97s/it]

iteration: 0 / step : 1600, train_losses: 0.1431,valid_losses: 6.9718


 47%|████▋     | 1701/3593 [1:07:44<274:57:53, 523.19s/it]

iteration: 0 / step : 1700, train_losses: 0.1402,valid_losses: 7.0275


 50%|████▉     | 1779/3593 [1:08:32<18:59,  1.59it/s]     

In [1]:
import matplotlib.pyplot as plt

# Plot the loss history collected at each evaluation point.
# Requires `train_losses` / `valid_losses` from the training cell to exist in the kernel.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, label="Train Loss", marker='o')
ax.plot(valid_losses, label="Validation Loss", marker='o')
ax.set_xlabel("Evaluation Step")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss Over Time")
ax.legend()
ax.grid()
plt.show()

NameError: name 'train_losses' is not defined

<Figure size 1000x500 with 0 Axes>

In [12]:
# Restore model weights from a saved checkpoint.
# NOTE(review): the training cell writes to './output/pretrain/v2/checkpoint_{iteration}.pth',
# while this cell reads from './output/pretrain/' — confirm which checkpoint is intended.
checkpoint_path = './output/pretrain/checkpoint_0.pth'
# map_location remaps stored tensors onto the active device, so a checkpoint
# saved on a GPU can also be loaded on a CPU-only machine.
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])


<All keys matched successfully>

In [23]:
# Encode the prompt and shape it as a batch of one sequence on the target device.
prompt_ids = tokenizer.encode('Привет')
input_tokens = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

# Generate in eval mode with gradient tracking disabled.
model.eval()
with torch.no_grad():
    output = model.generate(input_tokens, 100)

# Decode the first (only) sequence of the batch back to text.
generated_text = tokenizer.decode(output[0].tolist())
print(generated_text)


Приветросьба назад по видео. Ну высужно своих соседей. Тажется что кто-то голосует )Доброй ночи всегдачи! \nМыездаю, что есть проблемы с ребенок не бегут на трубает \nНапоминать, закрыли за чем я лично, а это свой счёт водой, только через прогул
