# Carregamento dos Pacotes

In [1]:
%env TF_CPP_MIN_LOG_LEVEL = 3

env: TF_CPP_MIN_LOG_LEVEL=3


In [2]:
import torch
import accelerate
import transformers
import numpy as np
import torch.nn.functional as F
from accelerate import Accelerator
from torch.utils.data import Dataset
from transformers import AutoConfig, AutoModelForCausalLM, GPT2Tokenizer
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Criando o LLM a Partir da Configuração do GPT-2 (Pesos Inicializados Aleatoriamente)

In [3]:
# Number of distinct tokens; matches the 13 entries of the Tokenizer vocabulary ('+', '=', '-1' and the digits 0-9).
vocab_size = 13
# Number of tokens in a prompt of the form "a + b =" (evaluation slices the input to this many tokens).
sequence_length = 4
# Number of digit tokens in the answer (sums are zero-padded to two digits).
# NOTE(review): OrganizaDataset also uses this value as the number of operands to sample.
result_length = 2
# Prompt tokens plus answer tokens.
context_length = sequence_length + result_length

In [4]:
# Start from the stock 'gpt2' configuration and override the vocabulary size,
# the number of attention heads and the number of transformer layers.
# NOTE(review): the model summary printed further down still shows
# `wpe: Embedding(1024, 768)`, so `n_ctx` does not appear to shrink the
# positional-embedding table; `n_positions` may be the argument that controls
# it -- confirm against the installed transformers version before changing.
config = AutoConfig.from_pretrained('gpt2',
                                    vocab_size = vocab_size,
                                    n_ctx = context_length,
                                    n_head = 4,
                                    n_layer = 2)

In [5]:
# Build the architecture from the configuration only: `from_config` does not
# load pretrained weights, so the model starts from random initialisation.
model = AutoModelForCausalLM.from_config(config)

In [6]:
def calcula_tamanho_modelo(model):
    """Count the scalar parameters of a model.

    Args:
        model: any object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        int: total number of elements across all parameter tensors.
    """
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

In [7]:
print(f'Tamanho do Modelo: {calcula_tamanho_modelo(model) / 1000 ** 2:.1f}M parâmetros')

Tamanho do Modelo: 15.0M parâmetros


In [8]:
type(model)

transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

# Criando Tokenizador Personalizado

In [9]:
class Tokenizer:
    """Whitespace tokenizer over a fixed 13-symbol arithmetic vocabulary.

    The vocabulary is '+', '=', the padding marker '-1' and the digits
    '0'..'9'. Token ids are the positions of the symbols in that list.
    """

    def __init__(self, numbers_qty = 10):
        """Build the symbol <-> id lookup tables.

        Args:
            numbers_qty: stored on the instance for reference only; the
                vocabulary is fixed and does not depend on this value.
        """
        vocab = ['+', '=', '-1', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        self.numbers_qty = numbers_qty
        self.pad_token = '-1'
        self.encoder = {symbol: idx for idx, symbol in enumerate(vocab)}
        self.decoder = {idx: symbol for idx, symbol in enumerate(vocab)}
        self.pad_token_id = self.encoder[self.pad_token]

    def decode(self, token_ids):
        """Turn a sequence of token ids back into a space-separated string.

        Args:
            token_ids: iterable of integer-like ids (Python ints, NumPy
                integers or 0-d torch tensors).

        Returns:
            str: the decoded symbols joined by single spaces.

        Raises:
            KeyError: if an id is not part of the vocabulary.
        """
        # int() normalises NumPy scalars and 0-d tensors; tensors hash by
        # identity, so they would never match the integer keys otherwise.
        return ' '.join(self.decoder[int(t)] for t in token_ids)

    def __call__(self, text):
        """Encode a whitespace-separated expression into a list of token ids.

        Args:
            text: string such as '9 + 5 = 1 4'; every whitespace-separated
                piece must be a vocabulary symbol (multi-digit numbers must
                be written digit by digit).

        Returns:
            list[int]: token ids, one per piece.

        Raises:
            KeyError: if a piece is not in the vocabulary (e.g. '14').
        """
        return [self.encoder[t] for t in text.split()]

In [10]:
tokenizer = Tokenizer(vocab_size)

In [11]:
tokenizer.decoder

{0: '+',
 1: '=',
 2: '-1',
 3: '0',
 4: '1',
 5: '2',
 6: '3',
 7: '4',
 8: '5',
 9: '6',
 10: '7',
 11: '8',
 12: '9'}

In [12]:
tokenizer('1 + 1 = 2')

[4, 0, 4, 1, 5]

In [13]:
tokenizer('9 + 5 = 14')

KeyError: '14'

In [14]:
tokenizer('9 + 5 = 1 4')

[12, 0, 8, 1, 4, 7]

In [15]:
tokenizer('19 + 5 = 2 4')

KeyError: '19'

# Organizando o Formato dos Dados

In [24]:
class OrganizaDataset(Dataset):
    """Synthetic dataset of single-digit additions, generated on access.

    Every item is a pair of token-id tensors:
      * input:  the tokens of "a + b = d1" (d1 is the first answer digit);
      * target: three padding tokens followed by the two answer digits,
        so only the answer positions contribute to a loss that ignores
        the padding id.

    `split` and `length` are validated/stored but do not influence the
    generated samples, and the requested index is ignored: each access
    draws fresh random operands. Relies on the module-level `tokenizer`
    and `result_length`.
    """

    def __init__(self, split, length = 6):
        """Record the split name ('treino' or 'teste') and the length value."""
        assert split in {'treino', 'teste'}
        self.length = length
        self.split = split

    def __len__(self):
        """Nominal dataset size (samples are random, so this only bounds an epoch)."""
        return 1_000_000

    def __getitem__(self, idx):
        """Draw a random addition and return its (input_ids, target_ids) tensors."""
        # Digits available as operands: every numeric vocabulary entry except the pad marker.
        digits = [
            int(symbol)
            for symbol in tokenizer.decoder.values()
            if symbol != tokenizer.pad_token and str(symbol).isnumeric()
        ]
        operands = torch.tensor(np.random.choice(digits, size = result_length))

        # Answer digits, left-padded with a zero when the sum has a single digit.
        answer = torch.tensor([int(ch) for ch in str(operands.sum().item())])
        left_pad = 1 if answer.size()[0] == 1 else 0
        answer = torch.nn.functional.pad(answer, (left_pad, 0), 'constant', 0)

        # Shift-by-one view of [a, b, d1, d2]: inputs drop the last element,
        # targets drop the first and mask their leading position with the pad value.
        full_sequence = torch.cat((operands, answer), dim = 0)
        inputs = full_sequence[:-1].clone()
        labels = full_sequence[1:].clone()
        labels[:1] = int(tokenizer.pad_token)

        input_text = f'{inputs[0].item()} + {inputs[1].item()} = {inputs[2].item()}'
        label_text = f'-1 {labels[0].item()} -1 {labels[1].item()} {labels[2].item()}'

        input_ids = torch.tensor(tokenizer(input_text))
        label_ids = torch.tensor(tokenizer(label_text))
        return input_ids, label_ids

# Criando os Datasets de Treino e Teste

In [32]:
# Training dataset. OrganizaDataset only stores `split` and `length`;
# samples are random additions regardless of these arguments.
dataset_treino = OrganizaDataset('treino', length=sequence_length)

In [33]:
# Test dataset. It draws from the same random generator as the training
# dataset, since OrganizaDataset does not use `split` when sampling.
dataset_teste = OrganizaDataset('teste', length= sequence_length)

In [34]:
x, y = dataset_treino[0]

In [35]:
x

tensor([ 5,  0, 12,  1,  4])

In [36]:
y

tensor([2, 2, 2, 4, 4])

In [37]:
print(tokenizer.decode(x.numpy()))

2 + 9 = 1


In [38]:
print(tokenizer.decode(y.numpy()))

-1 -1 -1 1 1


# Criando o Loop de Treinamento

In [39]:
# Number of passes over the (nominally 1,000,000-sample) training dataset.
num_epochs = 2
# Number of samples per optimisation step.
batch_size = 100

In [40]:
# Adam over all model parameters, using the library's default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())

In [41]:
# Batches of `batch_size` samples from the training dataset. Shuffling only
# permutes indices; OrganizaDataset ignores the index and draws random samples.
data = torch.utils.data.DataLoader(dataset_treino, shuffle = True, batch_size = batch_size)

In [42]:
accelerator = Accelerator()

In [43]:
# Hand model, optimizer and dataloader to Accelerate, which returns the
# versions to use from here on (presumably placed on accelerator.device -- confirm).
model, optimizer, data = accelerator.prepare(model, optimizer, data)

In [44]:
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(13, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-1): 2 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=13, bias=False)
)

# Treinando o LLM (Ajuste Fino) com Nossos Próprios Dados

In [49]:
%%time

print('\nIniciando o Ajuste Fino do LLM... Seja Paciente e Aguarde!')

# Ensure training mode (dropout active) even when this cell is re-run after
# an evaluation cell has switched the model to eval mode.
model.train()

for epoch in range(num_epochs):

    for source, targets in data:
        optimizer.zero_grad()

        # Flatten the leading (batch, position) axes so every position is one
        # classification example; positions labelled with the padding id are
        # excluded from the loss.
        logits = model(source).logits
        loss = F.cross_entropy(logits.flatten(end_dim = 1),
                               targets.flatten(end_dim = 1),
                               ignore_index = tokenizer.pad_token_id)

        accelerator.backward(loss)
        optimizer.step()

    # Report the loss of the last batch of the epoch. It is the value computed
    # for the optimisation step itself, so no extra forward pass is needed.
    print(f'\nEpoch: {epoch+1} / {num_epochs} -- Erro: {loss.item()}')

print('\nAjuste Fino do LLM Concluído com Sucesso.\n')


Iniciando o Ajuste Fino do LLM... Seja Paciente e Aguarde!

Epoch: 1 / 2 -- Erro: 0.03602537512779236

Epoch: 2 / 2 -- Erro: 0.029243573546409607

Ajuste Fino do LLM Concluído com Sucesso.

CPU times: user 9min 27s, sys: 1.42 s, total: 9min 28s
Wall time: 9min 28s


# Avaliação do LLM

In [55]:
def faz_previsao(entrada, solution_length = 6, model = model):
    """Greedily generate `solution_length` tokens after a text prompt.

    Args:
        entrada: whitespace-separated prompt accepted by the module-level
            `tokenizer`, e.g. '7 + 1 ='.
        solution_length: number of tokens to generate.
        model: causal language model to query; defaults to the module-level
            `model` as bound when this function was defined.

    Returns:
        str: the generated tokens decoded and joined by spaces.

    Side effects:
        Switches `model` to eval mode and leaves it there.
    """
    model.eval()
    entrada = torch.tensor(tokenizer(entrada)).to(accelerator.device)
    solution = []

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for _ in range(solution_length):
            saida = model(entrada)
            # The input is an unbatched 1-D sequence, so logits[-1] holds the
            # scores for the token that follows the current sequence.
            predicted = saida.logits[-1].argmax()
            entrada = torch.cat((entrada, predicted.unsqueeze(0)), dim = 0)
            solution.append(predicted.item())

    return tokenizer.decode(solution)

In [58]:
def avalia_modelo(num_samples = 1000, log = False):
    """Measure exact-match accuracy of the model on random test additions.

    For each sample, the first `sequence_length` input tokens form the prompt
    and the target tokens from position `sequence_length - 1` onwards form the
    expected answer; the prediction counts as correct only when the decoded
    strings are identical. The accuracy is printed, not returned.

    Args:
        num_samples: number of test samples to evaluate; must be at least 1.
        log: when True, print one line per sample with prompt, target and
            prediction.

    Raises:
        ValueError: if `num_samples` is smaller than 1 (accuracy would be
            undefined).
    """
    if num_samples < 1:
        raise ValueError('num_samples deve ser um inteiro positivo')

    correct = 0
    for i in range(num_samples):
        entrada, target = dataset_teste[i]
        entrada = entrada.cpu().numpy()
        target = target.cpu().numpy()
        entrada = tokenizer.decode(entrada[:sequence_length])
        target = tokenizer.decode(target[sequence_length-1:])
        predicted = faz_previsao(entrada, solution_length = result_length, model = model)

        acertou = target == predicted
        if acertou:
            correct += 1

        if log:
            rotulo = 'Acerto' if acertou else 'Erro'
            print(f'{rotulo} do Modelo: Input: {entrada} Target: {target} Previsão: {predicted}')

    print(f'\nAcurácia: {correct/num_samples}')

In [59]:
avalia_modelo(num_samples= 10, log = True)

Acerto do Modelo: Input: 7 + 1 = Target: 0 8 Previsão: 0 8
Acerto do Modelo: Input: 7 + 6 = Target: 1 3 Previsão: 1 3
Acerto do Modelo: Input: 3 + 7 = Target: 1 0 Previsão: 1 0
Acerto do Modelo: Input: 3 + 4 = Target: 0 7 Previsão: 0 7
Acerto do Modelo: Input: 6 + 7 = Target: 1 3 Previsão: 1 3
Acerto do Modelo: Input: 5 + 5 = Target: 1 0 Previsão: 1 0
Acerto do Modelo: Input: 2 + 5 = Target: 0 7 Previsão: 0 7
Acerto do Modelo: Input: 5 + 4 = Target: 0 9 Previsão: 0 9
Acerto do Modelo: Input: 0 + 7 = Target: 0 7 Previsão: 0 7
Acerto do Modelo: Input: 2 + 1 = Target: 0 3 Previsão: 0 3

Acurácia: 1.0


In [60]:
avalia_modelo(num_samples= 1000, log= False)


Acurácia: 1.0


In [61]:
type(model)

transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel

In [62]:
# Persist the trained model under llm/llm_final.
# NOTE(review): if Accelerate wrapped the model (e.g. distributed run),
# accelerator.unwrap_model(model) may be needed before saving -- confirm.
model.save_pretrained('llm/llm_final')