In [16]:
from jinja2.lexer import ignored_tokens

from bpe import RegexTokenizer

tokenizer = RegexTokenizer()
tokenizer.load(model_file='./output/tokenizer/tokenzier_v2.model')

def get_vocab_size(tokenizer_param: RegexTokenizer):
    return len(tokenizer_param.vocab) + len(tokenizer_param.special_tokens)

In [17]:
import json

with open('./output/finetune_text_corpus.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
data

["<|startoftext|>KKS<|separator|>[{'type': 'mention_name', 'text': 'Айрат', 'user_id': 1420112877}, ' добрый вечер! \\nВы не возражаете, если QR-код с приглашением в эту группу управляющая разместит на досках объявлений в подъездах?']<|endoftext|>",
 '<|startoftext|>Айрат Ф<|separator|>Здравствуйте! Не возражаю конечно<|endoftext|>',
 '<|startoftext|>Евгений Ж<|separator|>Добрый день соседи! Евгений 59 квартира<|endoftext|>',
 '<|startoftext|>Павел Лепихин<|separator|>Всем привет! Павел, Ирина, 97 ка.<|endoftext|>',
 '<|startoftext|>Евгений Ж<|separator|>А кто администратор группы? Добавьте сюда председателя управляющей компании!<|endoftext|>',
 '<|startoftext|>KKS<|separator|>Может добавить любой участник. Если есть контакты председателя - Вы можете это сделать.<|endoftext|>',
 '<|startoftext|>KKS<|separator|><|endoftext|>',
 '<|startoftext|>Елена Захарова<|separator|>Всем добрый день!\nУ меня есть его контактный номер телефона,но едва ли он будет согласен на присутствие в общедомовой

In [18]:
tokenized_data = []
for item in data:
    tokenized_item = tokenizer.encode(item, "all")
    tokenized_data.append(tokenized_item)
tokenized_data

[[1024,
  770,
  1025,
  91,
  643,
  639,
  456,
  376,
  109,
  287,
  116,
  105,
  595,
  968,
  651,
  101,
  537,
  376,
  272,
  456,
  376,
  382,
  332,
  837,
  537,
  376,
  117,
  289,
  114,
  95,
  105,
  100,
  456,
  32,
  49,
  52,
  50,
  48,
  49,
  49,
  582,
  55,
  55,
  125,
  44,
  376,
  1016,
  652,
  33,
  32,
  92,
  110,
  405,
  281,
  335,
  681,
  263,
  384,
  434,
  259,
  44,
  713,
  32,
  81,
  82,
  45,
  276,
  318,
  286,
  694,
  346,
  512,
  320,
  366,
  338,
  283,
  418,
  269,
  826,
  269,
  333,
  904,
  950,
  352,
  374,
  525,
  310,
  506,
  343,
  336,
  306,
  353,
  490,
  316,
  424,
  510,
  278,
  302,
  268,
  893,
  283,
  592,
  471,
  316,
  63,
  39,
  93,
  1026],
 [1024,
  382,
  332,
  837,
  256,
  164,
  1025,
  927,
  33,
  483,
  259,
  681,
  263,
  384,
  397,
  832,
  446,
  327,
  1026],
 [1024,
  503,
  939,
  893,
  256,
  150,
  1025,
  505,
  507,
  756,
  33,
  844,
  939,
  893,
  32,
  53,
  57,
  511,
  

In [19]:
initial_split_index = int(0.9*len(data))

split_index = initial_split_index
while split_index > 0 and not data[split_index - 1].startswith("<|startoftext|>"):
    split_index -=1

train_data = data[:split_index]
val_data = data[split_index:]

print(f"Training Set:")
print(f"Start message: {train_data[0].split('<|separator|>')[0]} ")
print(f"End message: {train_data[-1].split('<|separator|>')[0]}")

print(f"Validation Set:")
print(f"Start message: {val_data[0].split('<|separator|>')[0]} ")
print(f"End message: {val_data[-1].split('<|separator|>')[0]}")

Training Set:
Start message: <|startoftext|>KKS 
End message: <|startoftext|>Dasha
Validation Set:
Start message: <|startoftext|>Dasha 
End message: <|startoftext|>Айтакин Аббасова


In [20]:
train_data = tokenized_data[:split_index]
val_data = tokenized_data[split_index:]

In [26]:
block_size = 256

def combine_turns(data: list[int], should_trim_long_sequences: bool) -> list[int]:
    combine_turns_data = []
    for i in range(0, len(data) -1, 2):
        your_message = data[i]
        assistant_message = data[i+1]
        if not your_message or not assistant_message:
            continue
        final_message = your_message + assistant_message
        if len(final_message) > block_size and should_trim_long_sequences:
            final_message = final_message[-block_size:]

        combine_turns_data.append(final_message)
    return combine_turns_data

combined_train_data = combine_turns(train_data, should_trim_long_sequences=True)
combined_val_data = combine_turns(val_data, should_trim_long_sequences=True)

In [27]:
print(f"Training Set:")
print(f"Length: {len(train_data)} ")
print(f"Length combined data: {len(combined_train_data)} ")

print(f"Validation Set:")
print(f"Length: {len(val_data)} ")
print(f"Length combined data: {len(combined_val_data)} ")


Training Set:
Length: 6930 
Length combined data: 3465 
Validation Set:
Length: 771 
Length combined data: 385 


In [32]:
import torch
torch.manual_seed(3647)

padding_token = tokenizer.special_tokens["<|padding|>"]

def apply_padding_to_data( data:list[int], block_size:int, padding_token:int) -> torch.Tensor:
    tensors = []
    for i in range(len(data)):
        tensor = torch.tensor(data[i])
        padded_tensor = torch.nn.functional.pad(input=tensor,
                                                pad=(0,block_size - len(tensor)),
                                                value=padding_token)
        tensors.append(padded_tensor)
    return torch.stack(tensors)
train_data_tensor = apply_padding_to_data(combined_train_data,
                                          block_size,
                                          padding_token)
val_data_tensor = apply_padding_to_data(combined_val_data,
                                        block_size,
                                        padding_token)

In [34]:
train_data_tensor[0]

tensor([1024,  770, 1025,   91,  643,  639,  456,  376,  109,  287,  116,  105,
         595,  968,  651,  101,  537,  376,  272,  456,  376,  382,  332,  837,
         537,  376,  117,  289,  114,   95,  105,  100,  456,   32,   49,   52,
          50,   48,   49,   49,  582,   55,   55,  125,   44,  376, 1016,  652,
          33,   32,   92,  110,  405,  281,  335,  681,  263,  384,  434,  259,
          44,  713,   32,   81,   82,   45,  276,  318,  286,  694,  346,  512,
         320,  366,  338,  283,  418,  269,  826,  269,  333,  904,  950,  352,
         374,  525,  310,  506,  343,  336,  306,  353,  490,  316,  424,  510,
         278,  302,  268,  893,  283,  592,  471,  316,   63,   39,   93, 1026,
        1024,  382,  332,  837,  256,  164, 1025,  927,   33,  483,  259,  681,
         263,  384,  397,  832,  446,  327, 1026, 1028, 1028, 1028, 1028, 1028,
        1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
        1028, 1028, 1028, 1028, 1028, 10

In [35]:
train_data_tensor.shape

torch.Size([3465, 256])

In [40]:
from typing import Tuple
from torch.utils.data import DataLoader, Dataset

class FinetuneDataset(Dataset):
    def __init__(self, data: torch.Tensor, device: torch.device, padding_token:int):
        self.data = data
        self.device = device
        self.padding_token = padding_token
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx: int):
        sample = self.data[idx]
        x = sample.to(device=self.device)
        y = sample[1:].to(device=self.device)
        padding_tensor = torch.tensor([self.padding_token]).to(device=self.device)
        y = torch.cat((y, padding_tensor))
        return x, y

batch = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset = FinetuneDataset(train_data_tensor, device, padding_token)
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)

val_dataset = FinetuneDataset(val_data_tensor, device, padding_token)
val_loader = DataLoader(val_dataset, batch_size=batch, shuffle=True)



In [45]:
x, y = next(iter(train_loader))
print(x.shape, y.shape)

torch.Size([64, 256]) torch.Size([64, 256])


In [46]:
from transformers import GPTLanguageModel

block_size = 256
n_embedding = 512
n_head = 8
n_layer = 4
dropout = 0.2
batch_size = 64
vocab_size = get_vocab_size(tokenizer)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GPTLanguageModel(
    vocab_size=vocab_size,
    block_size=block_size,
    n_embeddings=n_embedding,
    n_head=n_head,
    device=device,
    n_layers=n_layer,
    dropout=dropout,
    ignore_index= tokenizer.special_tokens["<|padding|>"]
)
model.to(device)
model = torch.compile(model)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

13.79329 M parameters


In [47]:
checkpoint_path = f'./output/pretrain/v3/checkpoint100.pth'
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [55]:
input_tokens = tokenizer.encode('Что нового?', allowed_special_tokens="all")
input_tokens = torch.tensor(input_tokens, dtype=torch.long).unsqueeze(0).to(device)
model.eval()
with torch.no_grad():
    output = model.generate(input_tokens, 50)
a = output[0]
print(tokenizer.decode(a.tolist()))

Что нового?ожнонаstsep другымходможнокуатьа� урstarto больатьм)<|касsepara домрожеилтнонуюon Брусникаov квся вр р соб , подto больможал<|оКовto


In [60]:
from typing import Dict

eval_iters = 200

@torch.no_grad()
def estimate_loss(
        model: torch.nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
) -> Dict[str, float]:
    output = {}
    model.eval()
    for split, loader in [('train', train_loader), ('valid',val_loader)]:
        losses = []
        for x,y in loader:
            with torch.no_grad():
                _, loss = model(x,y)
            losses.append(loss.item())
        output[split] = sum(losses)/len(losses)
    model.train()
    return output

In [61]:
def save_checkpoint(model: GPTLanguageModel,
                    optimizer: torch.optim.Optimizer,
                    epoch: int,
                    loss: float,
                    file_path: str = 'checkpoint.pth'
                    ) -> None:
    checkpoint = {'epoch': epoch,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'loss': loss
                  }
    torch.save(checkpoint, file_path)

In [62]:
from tqdm import tqdm

max_iters = 1000
eval_intervals = 10
learning_rate = 6e-5
save_intervals = 100
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_losses = []
valid_losses = []

for iteration in tqdm(range(max_iters)):
    for batch_idx, (x_batch, y_batch) in enumerate(train_loader):
        if batch_idx % eval_intervals == 0 or batch_idx == len(train_loader):
            losses = estimate_loss(model, train_loader, val_loader)
            train_losses.append(losses['train'])
            valid_losses.append(losses["valid"])
            print(f'iteration {iteration} / step {batch_idx}'
                  f'train loss: {losses['train']:.4f}'
                  f'valid loss: {losses["valid"]:.4f}'
                  )

        logits, loss = model(x_batch, y_batch)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        if iteration % save_intervals == 0:
          save_checkpoint(model, optimizer, iteration, loss, f'./output/pretrain/v4/checkpoint{iteration}.pth')

W0921 19:40:06.408000 18260 .venv\Lib\site-packages\torch\_inductor\utils.py:1436] [1/0] Not enough SMs to use max_autotune_gemm mode
  0%|          | 0/1000 [00:03<?, ?it/s]


TritonMissing: Cannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at: https://github.com/triton-lang/triton

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(train_losses, label="Train Loss", marker='o')
plt.plot(valid_losses, label="Validation Loss", marker='o')
plt.xlabel("Evaluation Step")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Over Time")
plt.legend()
plt.grid()
plt.show()

In [53]:
def get_input_tokens(message: str) -> torch.Tensor:
    input_tokens = tokenizer.encode(f"<|startoftext|>{message}<|separator|>", allowed_special_tokens="all")
    input_tokens = torch.tensor(input_tokens, dtype=torch.long).unsqueeze(0).to(device)
    return input_tokens

user_message = "Привет, бот"
input_tokens = get_input_tokens(user_message)
model_answer = ""

model.eval()
while True:
    output_tokens = model.generate(input_tokens, 1)
    last_generated_token = output_tokens[0,-1].item()
    if last_generated_token == tokenizer.special_tokens["<|endoftext|>"]:
        break
    input_tokens = torch.cat((input_tokens, output_tokens[:, -1,:]), dim=1)
    model_answer += tokenizer.decode([last_generated_token])
    if len(output_tokens[0]) > block_size:
        input_tokens = input_tokens[:, -block_size]

print(f"You: {user_message}")
print(f"Model: {model_answer}")

IndexError: too many indices for tensor of dimension 2