## Tokenizer

In [3]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.utils import set_seed
set_seed(3407)
import random

In [1]:
StT = {
    "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
    "NumBeg": 10, "NumEnd": 11, "*": 12, "+": 13, "=": 14, "ThinkBeg": 15, "ThinkEnd": 16, "Eos": 17
}

TtS = {v: k for k, v in StT.items()}

def tokenize(s):
    out = []
    num = False
    past = ""
    for c in s:
        chars = past + c
        if chars in StT:
            tok = StT[chars]
            if tok < 10:
                if not num:
                    out.append(StT["NumBeg"])
                    num = True
                out.append(tok)
            else:
                if num:
                    out.append(StT["NumEnd"])
                    num = False
                out.append(tok)
            past = ""
        else:
            past += c

    return out + [StT["Eos"]]

def detokenize(toks):
    out = []
    num = False
    for tok in toks:
        if tok == 10 or tok == 11 or (tok >= 15 and tok <= 17) :
            continue
        out.append(TtS[tok])

    return "".join(out)

example = "123+456*6="
print(tokenize(example))
print(detokenize(tokenize(example)))


[10, 1, 2, 3, 11, 13, 10, 4, 5, 6, 11, 12, 10, 6, 11, 14, 17]
123+456*6=


## Dataset

In [7]:
def step_mul(length):
    a = [random.randint(0,9) for i in range(length)]
    b = [random.randint(0,9) for i in range(length)]
    # a = [9, 9, 9]
    # b = [9, 9, 9]

    val_a = int(''.join(str(d) for d in a))
    val_b = int(''.join(str(d) for d in b))
    if val_a < val_b:
        val_a, val_b = val_b, val_a
        a, b = b, a
    
    string = f"{val_a}*{val_b}=ThinkBeg"

    steps_eq = []
    for i in range(length):
        a_i = a[i] * 10**(length-i-1)
        # a_i = str(a[i]) 
        string += f"{a_i}*{val_b}+"
        steps_eq.append(str(a_i*val_b)+"+")

    string = string[:-1] + "=" + "".join(steps_eq)[:-1] + "=" + "ThinkEnd" + str(val_a*val_b)

    return string
    # print(out)

eq = step_mul(3)
print(eq)
tokenize(eq)
len(tokenize(eq))

644*314=ThinkBeg600*314+40*314+4*314=188400+12560+1256=ThinkEnd202216


79

In [8]:
class MulDataset(Dataset):
    """ 
    Dataset for the Add problem. E.g. for problem length 3:
    12 + 333 = 345
    Input: 0 1 2 3 3 3 -> Output: 0 3 4 5
    Which will feed into the transformer concatenated as:
    input:  0 1 2 3 3 3 0 3 4
    output: I I I I I 0 3 4 5
    where I is "ignore", as the transformer is reading the input sequence
    """

    def __init__(self, split, length=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
    
    def __len__(self):
        return 100000 # ...
    
    def get_vocab_size(self):
        return len(StT)
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return 79

    def __getitem__(self, idx):
        while True:
            rai = tokenize(step_mul(self.length))
            h = hash(str(rai[:1+2*(self.length+2)]))
            
            inp_split = 'test' if h % 4 == 0 else 'train' # designate 25% of examples as test
            if inp_split == self.split:
                break # ok
        
        if len(rai) < self.get_block_size():
            rai += [StT["Eos"]] * (self.get_block_size() - len(rai))
        
        x = torch.tensor(rai[:-1], dtype=torch.long)
        y = torch.tensor(rai[1:], dtype=torch.long)
        
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:2*(self.length+2)+1] = -1
        return x, y

In [9]:
# print an example instance of the dataset
train_dataset = MulDataset('train')
test_dataset = MulDataset('test')
x, y = train_dataset[0]

print (x)
for a, b in zip(x,y):
    print(int(a),int(b))

tensor([10,  3,  5,  9, 11, 12, 10,  1,  1,  9, 11, 14, 15, 10,  3,  0,  0, 11,
        12, 10,  1,  1,  9, 11, 13, 10,  5,  0, 11, 12, 10,  1,  1,  9, 11, 13,
        10,  9, 11, 12, 10,  1,  1,  9, 11, 14, 10,  3,  5,  7,  0,  0, 11, 13,
        10,  5,  9,  5,  0, 11, 13, 10,  1,  0,  7,  1, 11, 14, 16, 10,  4,  2,
         7,  2,  1, 17, 17, 17])
10 -1
3 -1
5 -1
9 -1
11 -1
12 -1
10 -1
1 -1
1 -1
9 -1
11 -1
14 15
15 10
10 3
3 0
0 0
0 11
11 12
12 10
10 1
1 1
1 9
9 11
11 13
13 10
10 5
5 0
0 11
11 12
12 10
10 1
1 1
1 9
9 11
11 13
13 10
10 9
9 11
11 12
12 10
10 1
1 1
1 9
9 11
11 14
14 10
10 3
3 5
5 7
7 0
0 0
0 11
11 13
13 10
10 5
5 9
9 5
5 0
0 11
11 13
13 10
10 1
1 0
0 7
7 1
1 11
11 14
14 16
16 10
10 4
4 2
2 7
7 2
2 1
1 17
17 17
17 17
17 17


## Model

In [11]:
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-mini'
# model_config.model_type = 'gpt-nano'

model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model = GPT(model_config)

number of parameters: 2.69M


In [12]:
print (model_config.n_head, model_config.n_layer, model_config.n_embd)

6 6 192


In [13]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 1e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 5000
train_config.num_workers = 0
# train_config.batch_size = 32
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [14]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

iter_dt 0.00ms; iter 0: train loss 2.92101
iter_dt 153.22ms; iter 100: train loss 0.99722
iter_dt 150.28ms; iter 200: train loss 0.66158
iter_dt 151.06ms; iter 300: train loss 0.53818
iter_dt 150.92ms; iter 400: train loss 0.49831
iter_dt 150.03ms; iter 500: train loss 0.47740
iter_dt 150.55ms; iter 600: train loss 0.42569
iter_dt 151.64ms; iter 700: train loss 0.39971
iter_dt 150.93ms; iter 800: train loss 0.37731
iter_dt 150.43ms; iter 900: train loss 0.33280
iter_dt 151.50ms; iter 1000: train loss 0.30704
iter_dt 150.90ms; iter 1100: train loss 0.29579
iter_dt 162.32ms; iter 1200: train loss 0.27111
iter_dt 151.02ms; iter 1300: train loss 0.26446
iter_dt 152.03ms; iter 1400: train loss 0.24715
iter_dt 150.52ms; iter 1500: train loss 0.24588
iter_dt 151.00ms; iter 1600: train loss 0.23221
iter_dt 154.69ms; iter 1700: train loss 0.20418
iter_dt 151.75ms; iter 1800: train loss 0.19791
iter_dt 151.38ms; iter 1900: train loss 0.18220
iter_dt 150.62ms; iter 2000: train loss 0.15697
iter_d

## Evaluation

In [15]:
# now let's perform some evaluation
model.eval()
None

In [16]:
class EvalMulDataset(Dataset):
    """ 
    Dataset for the Add problem. E.g. for problem length 3:
    12 + 333 = 345
    Input: 0 1 2 3 3 3 -> Output: 0 3 4 5
    Which will feed into the transformer concatenated as:
    input:  0 1 2 3 3 3 0 3 4
    output: I I I I I 0 3 4 5
    where I is "ignore", as the transformer is reading the input sequence
    """

    def __init__(self, split, length=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
    
    def __len__(self):
        return 100000 # ...
    
    def get_vocab_size(self):
        return len(StT)
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return 79

    def __getitem__(self, idx):
        while True:
            rai = tokenize(step_mul(self.length))
            h = hash(str(rai[:1+2*(self.length+2)]))
            
            inp_split = 'test' if h % 4 == 0 else 'train' # designate 25% of examples as test
            if inp_split == self.split:
                break # ok
        
        x = torch.tensor(rai[:-1], dtype=torch.long)[:rai.index(StT["="])+1]
        y = torch.tensor(rai[1:], dtype=torch.long)[rai.index(StT["ThinkEnd"]):]
        
        # we only want to predict at output locations, mask out the loss at the input locations
        return x, y
    
eval_train_dataset = EvalMulDataset('train')
eval_test_dataset = EvalMulDataset('test')
x, y = eval_train_dataset[0]

print (x)
print (y)
for a, b in zip(x,y):
    print(int(a),int(b))

tensor([10,  9,  8,  4, 11, 12, 10,  2,  1,  8, 11, 14])
tensor([10,  2,  1,  4,  5,  1,  2, 17])
10 10
9 2
8 1
4 4
11 5
12 1
10 2
2 17


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def evaluate_model_accuracy(model, eval_dataset):
    model.eval()
    correct = 0
    total = 0

    model.to(device)

    with torch.no_grad():
        for i in range(len(eval_dataset)):
            x, y = eval_dataset[i]
            x = x.unsqueeze(0).to(device)
            y = y.unsqueeze(0).to(device)

            output = model.generate(x, 79, do_sample=False)
            # print(detokenize(x[0].cpu().numpy()))
            # print(detokenize(y[0].cpu().numpy()))
            y = int(detokenize(y[0].cpu().numpy()))
            predicted = int(detokenize(output[0].cpu().numpy()).split("=")[-1])
            # print(predicted)

            # Compare predicted and actual values
            correct += 1 if y == predicted else 0
            total += 1

            if total % 100 == 0:
                print(f"Accuracy: {correct}/{total} = {correct/total:.2f}")

    accuracy = correct / total
    return accuracy

input_tensor = torch.tensor(tokenize("838*95="), dtype=torch.long).unsqueeze(0).to(device)
out = model.generate(input_tensor, 120, do_sample=False)
print(out)
print(detokenize(input_tensor[0].cpu().numpy()))
print(detokenize(out[0].cpu().numpy()))
# accuracy = evaluate_model_accuracy(model, eval_test_dataset)
# print(f"Model accuracy: {accuracy * 100:.2f}%")

tensor([[10,  8,  3,  8, 11, 12, 10,  9,  5, 11, 14, 17, 17, 17, 17, 17, 17, 17,
         17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
         17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  0,  0,  7,  3,  2,  0,  0,  0,
          0,  0,  2,  3,  8,  0,  0, 11, 13, 10,  7,  2,  0,  0, 11, 13, 10,  9,
          2,  2,  1,  0, 11, 13, 10,  7,  2, 11, 12, 10,  5,  2,  0, 11, 13, 10,
          5, 11, 12, 10,  0, 11, 12, 10,  0, 11, 12, 10,  0, 11, 14, 16, 10,  0,
         11, 14, 16, 10,  0, 11, 14, 16, 10,  0, 17, 17, 17, 17, 17, 17, 17, 17,
         17, 17, 17, 17, 17, 17]], device='cuda:0')
838*95=
838*95=007320000023800+7200+92210+72*520+5*0*0*0=0=0=0


In [14]:
torch.save(model.state_dict(), 'model_step_85.pth')