In [1]:
# make deterministic
from minGPT.mingpt.utils import set_seed
set_seed(42)

In [2]:
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [158]:
from torch.utils.data import Dataset

class AdditionDataset(Dataset):
    def __init__(self, ndigit, split):
        self.split = split # train/test
        self.ndigit = ndigit
        self.vocab_size = 10 # 10 possible digits 0..9
        # +1 due to potential carry overflow, but then -1 because very last digit doesn't plug back
        self.block_size = ndigit + ndigit + ndigit + 1 - 1
        
        # split up all addition problems into either training data or test data
        num = (10**self.ndigit)**2 # total number of possible combinations
        r = np.random.RandomState(1337) # make deterministic
        perm = r.permutation(num)
        num_test = min(int(num*0.2), 1000) # 20% of the whole dataset, or only up to 1000
        self.ixes = perm[:num_test] if split == 'test' else perm[num_test:]

    def __len__(self):
        return self.ixes.size

    def __getitem__(self, idx):
        # given a problem index idx, first recover the associated a + b
        idx = self.ixes[idx]
        nd = 10**self.ndigit
        a = idx // nd
        b = idx %  nd
        c = a + b
        render = f'%0{self.ndigit}d%0{self.ndigit}d%0{self.ndigit+1}d' % (a,b,c) # e.g. 03+25=28 becomes "0325028" 
        dix = [int(s) for s in render] # convert each character to its token index
        # x will be input to GPT and y will be the associated expected outputs
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long) # predict the next token in the sequence
        y[:self.ndigit*2-1] = -100 # we will only train in the output locations. -100 will mask loss to zero
        return x, y


In [212]:
# create a dataset for e.g. 2-digit addition
ndigit = 1
train_dataset = FactoringDataset(ndigit=ndigit, base=64, split='train')
test_dataset = FactoringDataset(ndigit=ndigit, base=64, split='test')

In [213]:
train_dataset[0]

ValueError: Bases greater than 36 not handled in base_repr.

In [209]:
from minGPT.mingpt.model import GPT, GPTConfig, GPT1Config

# initialize a baby GPT model
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size, 
                  n_layer=8, n_head=8, n_embd=64)
model = GPT(mconf)


11/12/2020 22:22:56 - INFO - minGPT.mingpt.model -   number of parameters: 4.024960e+05


In [211]:
from importlib import reload
import json
import os
import minGPT.mingpt.trainer
reload(minGPT.mingpt.trainer)
from minGPT.mingpt.trainer import Trainer, TrainerConfig
# with 1024*4 batch size 1 layer 1 head n_embed=128 stalled out at 0.05 test and 0.25 train
# initialize a trainer instance and kick off training
# 5:11 for epoch at 16
# 01:04 for epoch at 512
# 01:02 for epoch at 2048
# 1:20 for epoch at 2048*4



saveString = f'layer{mconf.n_layer}head{mconf.n_head}emb{mconf.n_embd}{str(type(train_dataset).__name__)}base{train_dataset.base}digit{train_dataset.ndigit}'
lossesPath = saveString + "losses.json"
tconf = TrainerConfig(max_epochs=10000, batch_size=1024, learning_rate=3e-4,
                      lr_decay=True, warmup_tokens=0, final_tokens=50*len(train_dataset)*(ndigit+1),
                      num_workers=0, ckpt_path=saveString, losses_path=lossesPath)
trainer = Trainer(model, train_dataset, test_dataset, tconf, useCuda=True)
while True:
    from IPython.display import clear_output
    def doCallback():
        give_exam(train_dataset, batch_size=1024, max_batches=10)
        give_exam(test_dataset, batch_size=1024, max_batches=10)
    trainer.train(doCallback)
    clear_output(wait=True)
    tconf.warmup_tokens = 0
    

11/12/2020 22:23:19 - INFO - minGPT.mingpt.trainer -   loading from checkpoint layer8head8emb64FactoringDatasetbase16digit2
11/12/2020 22:23:19 - INFO - minGPT.mingpt.trainer -   loading losses from checkpoint layer8head8emb64FactoringDatasetbase16digit2losses.json
epoch 1 iter 2: train loss 2.64284. lr 2.994739e-04: 100%|██████████| 3/3 [00:00<00:00,  6.25it/s]
11/12/2020 22:23:20 - INFO - minGPT.mingpt.trainer -   test loss: 2.608861
11/12/2020 22:23:20 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
  0%|          | 0/3 [00:00<?, ?it/s]

final score: 2/2333 = 0.09% correct
final score: 0/583 = 0.00% correct


epoch 2 iter 2: train loss 2.58420. lr 2.978994e-04: 100%|██████████| 3/3 [00:00<00:00,  6.41it/s]
11/12/2020 22:23:21 - INFO - minGPT.mingpt.trainer -   test loss: 2.547491
11/12/2020 22:23:21 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 3 iter 2: train loss 2.52601. lr 2.952875e-04: 100%|██████████| 3/3 [00:00<00:00,  6.50it/s]
11/12/2020 22:23:21 - INFO - minGPT.mingpt.trainer -   test loss: 2.502897
11/12/2020 22:23:21 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 4 iter 2: train loss 2.50215. lr 2.916565e-04: 100%|██████████| 3/3 [00:00<00:00,  6.53it/s]
11/12/2020 22:23:22 - INFO - minGPT.mingpt.trainer -   test loss: 2.463198
11/12/2020 22:23:22 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 5 iter 2: train loss 2.45072. lr 2.870318e-04: 100%|██████████| 3/3 [00:00<00:00,  6.48it/s]
11/12/2020 22:23:22 - INFO - minGPT.mingpt.trainer -   tes

final score: 18/2333 = 0.77% correct
final score: 3/583 = 0.51% correct


epoch 22 iter 2: train loss 2.10692. lr 1.096620e-04: 100%|██████████| 3/3 [00:00<00:00,  6.38it/s]
11/12/2020 22:23:32 - INFO - minGPT.mingpt.trainer -   test loss: 2.099664
11/12/2020 22:23:32 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 23 iter 2: train loss 2.10657. lr 9.771419e-05: 100%|██████████| 3/3 [00:00<00:00,  6.37it/s]
11/12/2020 22:23:33 - INFO - minGPT.mingpt.trainer -   test loss: 2.086666
11/12/2020 22:23:33 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 24 iter 2: train loss 2.10408. lr 8.613311e-05: 100%|██████████| 3/3 [00:00<00:00,  6.17it/s]
11/12/2020 22:23:33 - INFO - minGPT.mingpt.trainer -   test loss: 2.082760
11/12/2020 22:23:33 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 25 iter 2: train loss 2.10511. lr 7.500000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.14it/s]
11/12/2020 22:23:34 - INFO - minGPT.mingpt.trainer -  

final score: 10/2333 = 0.43% correct
final score: 1/583 = 0.17% correct


epoch 42 iter 2: train loss 2.04001. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.56it/s]
11/12/2020 22:23:43 - INFO - minGPT.mingpt.trainer -   test loss: 2.027187
11/12/2020 22:23:43 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 43 iter 2: train loss 2.02765. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.56it/s]
11/12/2020 22:23:43 - INFO - minGPT.mingpt.trainer -   test loss: 2.022385
11/12/2020 22:23:43 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 44 iter 2: train loss 2.02776. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.52it/s]
11/12/2020 22:23:44 - INFO - minGPT.mingpt.trainer -   test loss: 2.021202
11/12/2020 22:23:44 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 45 iter 2: train loss 2.04191. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.23it/s]
11/12/2020 22:23:45 - INFO - minGPT.mingpt.trainer -  

final score: 16/2333 = 0.69% correct
final score: 1/583 = 0.17% correct


epoch 62 iter 2: train loss 1.92505. lr 2.194944e-04: 100%|██████████| 3/3 [00:00<00:00,  6.66it/s]
11/12/2020 22:23:54 - INFO - minGPT.mingpt.trainer -   test loss: 1.888919
11/12/2020 22:23:54 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 63 iter 2: train loss 1.90549. lr 2.303740e-04: 100%|██████████| 3/3 [00:00<00:00,  6.55it/s]
11/12/2020 22:23:55 - INFO - minGPT.mingpt.trainer -   test loss: 1.879326
11/12/2020 22:23:55 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 64 iter 2: train loss 1.87523. lr 2.406899e-04: 100%|██████████| 3/3 [00:00<00:00,  6.49it/s]
11/12/2020 22:23:55 - INFO - minGPT.mingpt.trainer -   test loss: 1.877223
11/12/2020 22:23:55 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 65 iter 2: train loss 1.87192. lr 2.503696e-04: 100%|██████████| 3/3 [00:00<00:00,  6.48it/s]
11/12/2020 22:23:56 - INFO - minGPT.mingpt.trainer -  

final score: 29/2333 = 1.24% correct
final score: 10/583 = 1.72% correct


epoch 82 iter 2: train loss 1.68702. lr 2.749382e-04: 100%|██████████| 3/3 [00:00<00:00,  6.33it/s]
11/12/2020 22:24:05 - INFO - minGPT.mingpt.trainer -   test loss: 1.671738
11/12/2020 22:24:05 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 83 iter 2: train loss 1.66155. lr 2.675540e-04: 100%|██████████| 3/3 [00:00<00:00,  6.38it/s]
11/12/2020 22:24:06 - INFO - minGPT.mingpt.trainer -   test loss: 1.656869
11/12/2020 22:24:06 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 84 iter 2: train loss 1.68418. lr 2.593453e-04: 100%|██████████| 3/3 [00:00<00:00,  6.21it/s]
11/12/2020 22:24:06 - INFO - minGPT.mingpt.trainer -   test loss: 1.640218
11/12/2020 22:24:06 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 85 iter 2: train loss 1.67717. lr 2.503696e-04: 100%|██████████| 3/3 [00:00<00:00,  6.36it/s]
11/12/2020 22:24:07 - INFO - minGPT.mingpt.trainer -  

final score: 51/2333 = 2.19% correct
final score: 10/583 = 1.72% correct


epoch 102 iter 2: train loss 1.55738. lr 5.438640e-05: 100%|██████████| 3/3 [00:00<00:00,  6.55it/s]
11/12/2020 22:24:16 - INFO - minGPT.mingpt.trainer -   test loss: 1.542955
11/12/2020 22:24:16 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 103 iter 2: train loss 1.56941. lr 4.505050e-05: 100%|██████████| 3/3 [00:00<00:00,  6.57it/s]
11/12/2020 22:24:17 - INFO - minGPT.mingpt.trainer -   test loss: 1.541761
11/12/2020 22:24:17 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 104 iter 2: train loss 1.55510. lr 3.645074e-05: 100%|██████████| 3/3 [00:00<00:00,  6.50it/s]
11/12/2020 22:24:17 - INFO - minGPT.mingpt.trainer -   test loss: 1.534936
11/12/2020 22:24:17 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 105 iter 2: train loss 1.56729. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.27it/s]
11/12/2020 22:24:18 - INFO - minGPT.mingpt.trainer

final score: 47/2333 = 2.01% correct
final score: 8/583 = 1.37% correct


epoch 122 iter 2: train loss 1.54388. lr 4.505050e-05: 100%|██████████| 3/3 [00:00<00:00,  6.41it/s]
11/12/2020 22:24:27 - INFO - minGPT.mingpt.trainer -   test loss: 1.511563
11/12/2020 22:24:27 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 123 iter 2: train loss 1.55829. lr 5.438640e-05: 100%|██████████| 3/3 [00:00<00:00,  6.02it/s]
11/12/2020 22:24:28 - INFO - minGPT.mingpt.trainer -   test loss: 1.513318
epoch 124 iter 2: train loss 1.55328. lr 6.439296e-05: 100%|██████████| 3/3 [00:00<00:00,  6.39it/s]
11/12/2020 22:24:28 - INFO - minGPT.mingpt.trainer -   test loss: 1.512260
epoch 125 iter 2: train loss 1.52786. lr 7.500000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.40it/s]
11/12/2020 22:24:29 - INFO - minGPT.mingpt.trainer -   test loss: 1.510003
11/12/2020 22:24:29 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 126 iter 2: train loss 1.52791. lr 8.613311e-05: 100%|██████████| 3/3 [00:00

final score: 64/2333 = 2.74% correct
final score: 6/583 = 1.03% correct


epoch 142 iter 2: train loss 1.46604. lr 2.675540e-04: 100%|██████████| 3/3 [00:00<00:00,  6.51it/s]
11/12/2020 22:24:38 - INFO - minGPT.mingpt.trainer -   test loss: 1.445410
epoch 143 iter 2: train loss 1.49864. lr 2.749382e-04: 100%|██████████| 3/3 [00:00<00:00,  6.27it/s]
11/12/2020 22:24:39 - INFO - minGPT.mingpt.trainer -   test loss: 1.469734
epoch 144 iter 2: train loss 1.47778. lr 2.814460e-04: 100%|██████████| 3/3 [00:00<00:00,  5.45it/s]
11/12/2020 22:24:39 - INFO - minGPT.mingpt.trainer -   test loss: 1.433300
11/12/2020 22:24:39 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 145 iter 2: train loss 1.47840. lr 2.870318e-04: 100%|██████████| 3/3 [00:00<00:00,  5.60it/s]
11/12/2020 22:24:40 - INFO - minGPT.mingpt.trainer -   test loss: 1.479576
epoch 146 iter 2: train loss 1.48679. lr 2.916565e-04: 100%|██████████| 3/3 [00:00<00:00,  6.43it/s]
11/12/2020 22:24:40 - INFO - minGPT.mingpt.trainer -   test loss: 1.434687
epoch 147 ite

final score: 63/2333 = 2.70% correct
final score: 3/583 = 0.51% correct


epoch 162 iter 2: train loss 1.39447. lr 2.303740e-04: 100%|██████████| 3/3 [00:00<00:00,  6.37it/s]
11/12/2020 22:24:49 - INFO - minGPT.mingpt.trainer -   test loss: 1.371096
11/12/2020 22:24:49 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 163 iter 2: train loss 1.38296. lr 2.194944e-04: 100%|██████████| 3/3 [00:00<00:00,  6.35it/s]
11/12/2020 22:24:50 - INFO - minGPT.mingpt.trainer -   test loss: 1.362366
11/12/2020 22:24:50 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 164 iter 2: train loss 1.36628. lr 2.081273e-04: 100%|██████████| 3/3 [00:00<00:00,  6.40it/s]
11/12/2020 22:24:50 - INFO - minGPT.mingpt.trainer -   test loss: 1.383081
epoch 165 iter 2: train loss 1.37189. lr 1.963525e-04: 100%|██████████| 3/3 [00:00<00:00,  6.34it/s]
11/12/2020 22:24:51 - INFO - minGPT.mingpt.trainer -   test loss: 1.353999
11/12/2020 22:24:51 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64Fact

final score: 99/2333 = 4.24% correct
final score: 9/583 = 1.54% correct


epoch 182 iter 2: train loss 1.35504. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.37it/s]
11/12/2020 22:25:00 - INFO - minGPT.mingpt.trainer -   test loss: 1.312045
11/12/2020 22:25:00 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 183 iter 2: train loss 1.32153. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.41it/s]
11/12/2020 22:25:01 - INFO - minGPT.mingpt.trainer -   test loss: 1.312091
epoch 184 iter 2: train loss 1.35484. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  5.52it/s]
11/12/2020 22:25:01 - INFO - minGPT.mingpt.trainer -   test loss: 1.309867
11/12/2020 22:25:01 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 185 iter 2: train loss 1.34575. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  5.93it/s]
11/12/2020 22:25:02 - INFO - minGPT.mingpt.trainer -   test loss: 1.307514
11/12/2020 22:25:02 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64Fact

final score: 109/2333 = 4.67% correct
final score: 7/583 = 1.20% correct


epoch 202 iter 2: train loss 1.35878. lr 9.771419e-05: 100%|██████████| 3/3 [00:00<00:00,  6.44it/s]
11/12/2020 22:25:11 - INFO - minGPT.mingpt.trainer -   test loss: 1.305374
epoch 203 iter 2: train loss 1.33971. lr 1.096620e-04: 100%|██████████| 3/3 [00:00<00:00,  6.44it/s]
11/12/2020 22:25:12 - INFO - minGPT.mingpt.trainer -   test loss: 1.296928
11/12/2020 22:25:12 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 204 iter 2: train loss 1.33267. lr 1.218928e-04: 100%|██████████| 3/3 [00:00<00:00,  6.41it/s]
11/12/2020 22:25:12 - INFO - minGPT.mingpt.trainer -   test loss: 1.298494
epoch 205 iter 2: train loss 1.32430. lr 1.343207e-04: 100%|██████████| 3/3 [00:00<00:00,  6.55it/s]
11/12/2020 22:25:13 - INFO - minGPT.mingpt.trainer -   test loss: 1.298942
epoch 206 iter 2: train loss 1.32644. lr 1.468586e-04: 100%|██████████| 3/3 [00:00<00:00,  6.50it/s]
11/12/2020 22:25:13 - INFO - minGPT.mingpt.trainer -   test loss: 1.301240
epoch 207 ite

final score: 86/2333 = 3.69% correct
final score: 6/583 = 1.03% correct


epoch 222 iter 2: train loss 1.30737. lr 2.952875e-04: 100%|██████████| 3/3 [00:00<00:00,  6.51it/s]
11/12/2020 22:25:22 - INFO - minGPT.mingpt.trainer -   test loss: 1.271666
11/12/2020 22:25:22 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 223 iter 2: train loss 1.30732. lr 2.978994e-04: 100%|██████████| 3/3 [00:00<00:00,  5.86it/s]
11/12/2020 22:25:23 - INFO - minGPT.mingpt.trainer -   test loss: 1.287003
epoch 224 iter 2: train loss 1.32323. lr 2.994739e-04: 100%|██████████| 3/3 [00:00<00:00,  6.27it/s]
11/12/2020 22:25:23 - INFO - minGPT.mingpt.trainer -   test loss: 1.295130
epoch 225 iter 2: train loss 1.28263. lr 3.000000e-04: 100%|██████████| 3/3 [00:00<00:00,  6.43it/s]
11/12/2020 22:25:24 - INFO - minGPT.mingpt.trainer -   test loss: 1.284902
epoch 226 iter 2: train loss 1.31343. lr 2.994739e-04: 100%|██████████| 3/3 [00:00<00:00,  6.42it/s]
11/12/2020 22:25:24 - INFO - minGPT.mingpt.trainer -   test loss: 1.282009
epoch 227 ite

final score: 112/2333 = 4.80% correct
final score: 6/583 = 1.03% correct


epoch 242 iter 2: train loss 1.27587. lr 1.719125e-04: 100%|██████████| 3/3 [00:00<00:00,  6.36it/s]
11/12/2020 22:25:33 - INFO - minGPT.mingpt.trainer -   test loss: 1.248540
epoch 243 iter 2: train loss 1.25645. lr 1.594186e-04: 100%|██████████| 3/3 [00:00<00:00,  6.41it/s]
11/12/2020 22:25:34 - INFO - minGPT.mingpt.trainer -   test loss: 1.237594
11/12/2020 22:25:34 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 244 iter 2: train loss 1.25846. lr 1.468586e-04: 100%|██████████| 3/3 [00:00<00:00,  6.32it/s]
11/12/2020 22:25:34 - INFO - minGPT.mingpt.trainer -   test loss: 1.222584
11/12/2020 22:25:34 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 245 iter 2: train loss 1.26047. lr 1.343207e-04: 100%|██████████| 3/3 [00:00<00:00,  6.39it/s]
11/12/2020 22:25:35 - INFO - minGPT.mingpt.trainer -   test loss: 1.235701
epoch 246 iter 2: train loss 1.24787. lr 1.218928e-04: 100%|██████████| 3/3 [00:00

final score: 134/2333 = 5.74% correct
final score: 8/583 = 1.37% correct


epoch 262 iter 2: train loss 1.23671. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.47it/s]
11/12/2020 22:25:44 - INFO - minGPT.mingpt.trainer -   test loss: 1.212020
11/12/2020 22:25:44 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 263 iter 2: train loss 1.23952. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.46it/s]
11/12/2020 22:25:44 - INFO - minGPT.mingpt.trainer -   test loss: 1.211789
11/12/2020 22:25:44 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 264 iter 2: train loss 1.22654. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.39it/s]
11/12/2020 22:25:45 - INFO - minGPT.mingpt.trainer -   test loss: 1.209665
11/12/2020 22:25:45 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 265 iter 2: train loss 1.22901. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.47it/s]
11/12/2020 22:25:45 - INFO - minGPT.mingpt.trainer

final score: 101/2333 = 4.33% correct
final score: 9/583 = 1.54% correct


epoch 282 iter 2: train loss 1.21987. lr 1.594186e-04: 100%|██████████| 3/3 [00:00<00:00,  6.40it/s]
11/12/2020 22:25:55 - INFO - minGPT.mingpt.trainer -   test loss: 1.234465
epoch 283 iter 2: train loss 1.20413. lr 1.719125e-04: 100%|██████████| 3/3 [00:00<00:00,  6.40it/s]
11/12/2020 22:25:55 - INFO - minGPT.mingpt.trainer -   test loss: 1.209336
epoch 284 iter 2: train loss 1.22751. lr 1.842526e-04: 100%|██████████| 3/3 [00:00<00:00,  6.65it/s]
11/12/2020 22:25:56 - INFO - minGPT.mingpt.trainer -   test loss: 1.229955
epoch 285 iter 2: train loss 1.23041. lr 1.963525e-04: 100%|██████████| 3/3 [00:00<00:00,  6.39it/s]
11/12/2020 22:25:56 - INFO - minGPT.mingpt.trainer -   test loss: 1.212913
epoch 286 iter 2: train loss 1.23847. lr 2.081273e-04: 100%|██████████| 3/3 [00:00<00:00,  6.45it/s]
11/12/2020 22:25:57 - INFO - minGPT.mingpt.trainer -   test loss: 1.214017
epoch 287 iter 2: train loss 1.22970. lr 2.194944e-04: 100%|██████████| 3/3 [00:00<00:00,  6.42it/s]
11/12/2020 22:25:57

final score: 117/2333 = 5.02% correct
final score: 13/583 = 2.23% correct


epoch 302 iter 2: train loss 1.21445. lr 2.978994e-04: 100%|██████████| 3/3 [00:00<00:00,  6.43it/s]
11/12/2020 22:26:06 - INFO - minGPT.mingpt.trainer -   test loss: 1.217766
epoch 303 iter 2: train loss 1.20480. lr 2.952875e-04: 100%|██████████| 3/3 [00:00<00:00,  6.38it/s]
11/12/2020 22:26:06 - INFO - minGPT.mingpt.trainer -   test loss: 1.208963
epoch 304 iter 2: train loss 1.21461. lr 2.916565e-04: 100%|██████████| 3/3 [00:00<00:00,  6.52it/s]
11/12/2020 22:26:07 - INFO - minGPT.mingpt.trainer -   test loss: 1.249311
epoch 305 iter 2: train loss 1.23285. lr 2.870318e-04: 100%|██████████| 3/3 [00:00<00:00,  6.06it/s]
11/12/2020 22:26:07 - INFO - minGPT.mingpt.trainer -   test loss: 1.203547
epoch 306 iter 2: train loss 1.22115. lr 2.814460e-04: 100%|██████████| 3/3 [00:00<00:00,  6.36it/s]
11/12/2020 22:26:08 - INFO - minGPT.mingpt.trainer -   test loss: 1.208102
epoch 307 iter 2: train loss 1.22065. lr 2.749382e-04: 100%|██████████| 3/3 [00:00<00:00,  6.39it/s]
11/12/2020 22:26:08

final score: 130/2333 = 5.57% correct
final score: 12/583 = 2.06% correct


epoch 322 iter 2: train loss 1.18005. lr 1.096620e-04: 100%|██████████| 3/3 [00:00<00:00,  6.46it/s]
11/12/2020 22:26:17 - INFO - minGPT.mingpt.trainer -   test loss: 1.185714
epoch 323 iter 2: train loss 1.18685. lr 9.771419e-05: 100%|██████████| 3/3 [00:00<00:00,  6.47it/s]
11/12/2020 22:26:17 - INFO - minGPT.mingpt.trainer -   test loss: 1.175491
epoch 324 iter 2: train loss 1.16534. lr 8.613311e-05: 100%|██████████| 3/3 [00:00<00:00,  6.35it/s]
11/12/2020 22:26:18 - INFO - minGPT.mingpt.trainer -   test loss: 1.173310
11/12/2020 22:26:18 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 325 iter 2: train loss 1.14855. lr 7.500000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.56it/s]
11/12/2020 22:26:18 - INFO - minGPT.mingpt.trainer -   test loss: 1.167332
11/12/2020 22:26:18 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 326 iter 2: train loss 1.17090. lr 6.439296e-05: 100%|██████████| 3/3 [00:00

final score: 173/2333 = 7.42% correct
final score: 13/583 = 2.23% correct


epoch 342 iter 2: train loss 1.14934. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.20it/s]
11/12/2020 22:26:28 - INFO - minGPT.mingpt.trainer -   test loss: 1.161005
epoch 343 iter 2: train loss 1.16569. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.37it/s]
11/12/2020 22:26:28 - INFO - minGPT.mingpt.trainer -   test loss: 1.161404
epoch 344 iter 2: train loss 1.16056. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.62it/s]
11/12/2020 22:26:29 - INFO - minGPT.mingpt.trainer -   test loss: 1.163353
epoch 345 iter 2: train loss 1.18048. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.44it/s]
11/12/2020 22:26:29 - INFO - minGPT.mingpt.trainer -   test loss: 1.164088
epoch 346 iter 2: train loss 1.12876. lr 3.645074e-05: 100%|██████████| 3/3 [00:00<00:00,  6.34it/s]
11/12/2020 22:26:30 - INFO - minGPT.mingpt.trainer -   test loss: 1.164714
epoch 347 iter 2: train loss 1.18554. lr 4.505050e-05: 100%|██████████| 3/3 [00:00<00:00,  6.34it/s]
11/12/2020 22:26:30

final score: 165/2333 = 7.07% correct
final score: 11/583 = 1.89% correct


epoch 362 iter 2: train loss 1.14487. lr 2.194944e-04: 100%|██████████| 3/3 [00:00<00:00,  5.58it/s]
11/12/2020 22:26:39 - INFO - minGPT.mingpt.trainer -   test loss: 1.175518
epoch 363 iter 2: train loss 1.17576. lr 2.303740e-04: 100%|██████████| 3/3 [00:00<00:00,  5.92it/s]
11/12/2020 22:26:40 - INFO - minGPT.mingpt.trainer -   test loss: 1.175099
epoch 364 iter 2: train loss 1.15596. lr 2.406899e-04: 100%|██████████| 3/3 [00:00<00:00,  6.55it/s]
11/12/2020 22:26:40 - INFO - minGPT.mingpt.trainer -   test loss: 1.188699
epoch 365 iter 2: train loss 1.17887. lr 2.503696e-04: 100%|██████████| 3/3 [00:00<00:00,  6.05it/s]
11/12/2020 22:26:41 - INFO - minGPT.mingpt.trainer -   test loss: 1.189256
epoch 366 iter 2: train loss 1.20229. lr 2.593453e-04: 100%|██████████| 3/3 [00:00<00:00,  5.51it/s]
11/12/2020 22:26:42 - INFO - minGPT.mingpt.trainer -   test loss: 1.193600
epoch 367 iter 2: train loss 1.12915. lr 2.675540e-04: 100%|██████████| 3/3 [00:00<00:00,  6.00it/s]
11/12/2020 22:26:42

final score: 162/2333 = 6.94% correct
final score: 7/583 = 1.20% correct


epoch 382 iter 2: train loss 1.16670. lr 2.749382e-04: 100%|██████████| 3/3 [00:00<00:00,  6.46it/s]
11/12/2020 22:26:50 - INFO - minGPT.mingpt.trainer -   test loss: 1.175828
epoch 383 iter 2: train loss 1.16933. lr 2.675540e-04: 100%|██████████| 3/3 [00:00<00:00,  6.45it/s]
11/12/2020 22:26:51 - INFO - minGPT.mingpt.trainer -   test loss: 1.184990
epoch 384 iter 2: train loss 1.09083. lr 2.593453e-04: 100%|██████████| 3/3 [00:00<00:00,  6.40it/s]
11/12/2020 22:26:51 - INFO - minGPT.mingpt.trainer -   test loss: 1.161512
epoch 385 iter 2: train loss 1.15724. lr 2.503696e-04: 100%|██████████| 3/3 [00:00<00:00,  6.40it/s]
11/12/2020 22:26:52 - INFO - minGPT.mingpt.trainer -   test loss: 1.139747
11/12/2020 22:26:52 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 386 iter 2: train loss 1.15868. lr 2.406899e-04: 100%|██████████| 3/3 [00:00<00:00,  6.42it/s]
11/12/2020 22:26:52 - INFO - minGPT.mingpt.trainer -   test loss: 1.164320
epoch 387 ite

final score: 219/2333 = 9.39% correct
final score: 13/583 = 2.23% correct


epoch 402 iter 2: train loss 1.13490. lr 5.438640e-05: 100%|██████████| 3/3 [00:00<00:00,  6.10it/s]
11/12/2020 22:27:01 - INFO - minGPT.mingpt.trainer -   test loss: 1.130997
epoch 403 iter 2: train loss 1.10006. lr 4.505050e-05: 100%|██████████| 3/3 [00:00<00:00,  6.50it/s]
11/12/2020 22:27:02 - INFO - minGPT.mingpt.trainer -   test loss: 1.127344
epoch 404 iter 2: train loss 1.09157. lr 3.645074e-05: 100%|██████████| 3/3 [00:00<00:00,  6.28it/s]
11/12/2020 22:27:02 - INFO - minGPT.mingpt.trainer -   test loss: 1.126338
epoch 405 iter 2: train loss 1.08073. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.53it/s]
11/12/2020 22:27:03 - INFO - minGPT.mingpt.trainer -   test loss: 1.125237
epoch 406 iter 2: train loss 1.08809. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.45it/s]
11/12/2020 22:27:03 - INFO - minGPT.mingpt.trainer -   test loss: 1.125956
epoch 407 iter 2: train loss 1.06729. lr 3.000000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.43it/s]
11/12/2020 22:27:04

final score: 250/2333 = 10.72% correct
final score: 17/583 = 2.92% correct


epoch 422 iter 2: train loss 1.11921. lr 4.505050e-05: 100%|██████████| 3/3 [00:00<00:00,  6.21it/s]
11/12/2020 22:27:12 - INFO - minGPT.mingpt.trainer -   test loss: 1.120996
epoch 423 iter 2: train loss 1.06694. lr 5.438640e-05: 100%|██████████| 3/3 [00:00<00:00,  6.23it/s]
11/12/2020 22:27:13 - INFO - minGPT.mingpt.trainer -   test loss: 1.125630
epoch 424 iter 2: train loss 1.07766. lr 6.439296e-05: 100%|██████████| 3/3 [00:00<00:00,  6.40it/s]
11/12/2020 22:27:13 - INFO - minGPT.mingpt.trainer -   test loss: 1.124812
epoch 425 iter 2: train loss 1.10743. lr 7.500000e-05: 100%|██████████| 3/3 [00:00<00:00,  6.44it/s]
11/12/2020 22:27:14 - INFO - minGPT.mingpt.trainer -   test loss: 1.118151
epoch 426 iter 2: train loss 1.10557. lr 8.613311e-05: 100%|██████████| 3/3 [00:00<00:00,  6.38it/s]
11/12/2020 22:27:14 - INFO - minGPT.mingpt.trainer -   test loss: 1.122246
epoch 427 iter 2: train loss 1.07008. lr 9.771419e-05: 100%|██████████| 3/3 [00:00<00:00,  6.50it/s]
11/12/2020 22:27:15

final score: 234/2333 = 10.03% correct
final score: 12/583 = 2.06% correct


epoch 442 iter 2: train loss 1.10624. lr 2.675540e-04: 100%|██████████| 3/3 [00:00<00:00,  5.53it/s]
11/12/2020 22:27:24 - INFO - minGPT.mingpt.trainer -   test loss: 1.126918
epoch 443 iter 2: train loss 1.11807. lr 2.749382e-04: 100%|██████████| 3/3 [00:00<00:00,  5.49it/s]
11/12/2020 22:27:24 - INFO - minGPT.mingpt.trainer -   test loss: 1.129481
epoch 444 iter 2: train loss 1.09457. lr 2.814460e-04: 100%|██████████| 3/3 [00:00<00:00,  5.64it/s]
11/12/2020 22:27:25 - INFO - minGPT.mingpt.trainer -   test loss: 1.134041
epoch 445 iter 2: train loss 1.08715. lr 2.870318e-04: 100%|██████████| 3/3 [00:00<00:00,  6.57it/s]
11/12/2020 22:27:25 - INFO - minGPT.mingpt.trainer -   test loss: 1.138533
epoch 446 iter 2: train loss 1.13210. lr 2.916565e-04: 100%|██████████| 3/3 [00:00<00:00,  6.44it/s]
11/12/2020 22:27:26 - INFO - minGPT.mingpt.trainer -   test loss: 1.147942
epoch 447 iter 2: train loss 1.10229. lr 2.952875e-04: 100%|██████████| 3/3 [00:00<00:00,  6.41it/s]
11/12/2020 22:27:26

final score: 311/2333 = 13.33% correct
final score: 22/583 = 3.77% correct


epoch 462 iter 2: train loss 1.05749. lr 2.303740e-04: 100%|██████████| 3/3 [00:00<00:00,  6.38it/s]
11/12/2020 22:27:35 - INFO - minGPT.mingpt.trainer -   test loss: 1.114174
11/12/2020 22:27:35 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 463 iter 2: train loss 1.07924. lr 2.194944e-04: 100%|██████████| 3/3 [00:00<00:00,  6.39it/s]
11/12/2020 22:27:35 - INFO - minGPT.mingpt.trainer -   test loss: 1.124339
epoch 464 iter 2: train loss 1.06731. lr 2.081273e-04: 100%|██████████| 3/3 [00:00<00:00,  6.41it/s]
11/12/2020 22:27:36 - INFO - minGPT.mingpt.trainer -   test loss: 1.105644
11/12/2020 22:27:36 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64FactoringDatasetbase16digit2
epoch 465 iter 2: train loss 1.05559. lr 1.963525e-04: 100%|██████████| 3/3 [00:00<00:00,  6.39it/s]
11/12/2020 22:27:36 - INFO - minGPT.mingpt.trainer -   test loss: 1.102253
11/12/2020 22:27:36 - INFO - minGPT.mingpt.trainer -   saving layer8head8emb64Fact

KeyboardInterrupt: 

In [182]:
# now let's give the trained model an addition exam
from torch.utils.data.dataloader import DataLoader
from minGPT.mingpt.utils import sample

def give_exam(dataset, batch_size=32, max_batches=-1, showFailures=False, failureCallback=None):
    
    results = []
    loader = DataLoader(dataset, batch_size=batch_size)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        numMask = (y[0]==-100).int().sum() # we mask away the inputs, so sum
        numOutput = y[0].shape[0]-numMask
        d1d2 = x[:, :numMask+1] # +1 because y is shifted one so we actually mask two things
        d1d2d3 = sample(model, d1d2, numOutput).to(trainer.device)
        desiredOutput = y[:,numMask:]
        modelOutput = d1d2d3[:,numMask+1:]
        correct = (desiredOutput == modelOutput).cpu().all(axis=1) 
        for i in range(x.size(0)):
            results.append(int(correct[i]))
            judge = 'YEP!!!' if correct[i] else 'NOPE'
            if not correct[i] and showFailures:
                print(f"GPT claims that {str(x[i].cpu().numpy())} gives {str(d1d2d3[i][1:].cpu().numpy())} actual is {str(y[i].cpu().numpy())}")
            if not correct[i] and failureCallback is not None:
                failureCallback(x[i].cpu().numpy(), y[i].cpu().numpy(), d1d2d3[i][1:].cpu().numpy())
        
        if max_batches >= 0 and b+1 >= max_batches:
            break

    print("final score: %d/%d = %.2f%% correct" % (np.sum(results), len(results), 100*np.mean(results)))

In [191]:
import customDatasets
from importlib import reload
reload(customDatasets)

def toNum(base, arr):
    a = arr
    if type(arr) == torch.tensor:
        a = arr.cpu().numpy()
    val = 0
    for i, x in enumerate(a[::-1]):
        val += x*(base**i)
    return val
        
def multArrays(base, a, b):
    aVal = toNum(base, a)
    bVal = toNum(base, b)
    return aVal*bVal

base = 32
print(datasets.prime(toNum(base, [24, 29])))
print(convertArrayToNum(base, [9,17,31,25]), multArrays(base, [12, 13], [24, 29]))

True
313337 316409


In [202]:
global numPrimes
global numNotPrimes
numPrimes = 0
numNotPrimes = 0
def failureFactor(x, y, networkOutput):
    global numPrimes
    global numNotPrimes
    nDigits = (len(x)+1)//4
    productResult = toNum(base, x[:nDigits*2])
    actualA, actualB = toNum(base, y[-nDigits*2:-nDigits]), toNum(base, y[-nDigits:])
    outputA, outputB = toNum(base, networkOutput[-nDigits*2:-nDigits]), toNum(base, networkOutput[-nDigits:])
    print("acutal:", productResult, "=", actualA, "*", actualB, "output", (outputA*outputB), "=", outputA, "*", outputB)
    diff = productResult-outputA*outputB
    if customDatasets.prime(outputA): numPrimes += 1
    else: numNotPrimes += 1
    if customDatasets.prime(outputB): numPrimes += 1
    else: numNotPrimes += 1
    print("difference:", diff, "relative magnitude:", float(diff)/productResult, "isPrime:", outputA, customDatasets.prime(outputA), outputB, customDatasets.prime(outputB))

In [204]:
give_exam(train_dataset, batch_size=1024, max_batches=10, showFailures=True, failureCallback=failureFactor)
print(numPrimes, numNotPrimes)

GPT claims that [0 0 0 6 0 2 0] gives [ 0  0  6  0  2  0 19] actual is [-100 -100 -100    0    2    0    3]
acutal: 6 = 2 * 3 output 38 = 2 * 19
difference: -32 relative magnitude: -5.333333333333333 isPrime: 2 True 19 True
GPT claims that [ 9 17 31 25 13 15 22] gives [17 31 25 12 13 24 29] actual is [-100 -100 -100   13   15   22   23]
acutal: 313337 = 431 * 727 output 316409 = 397 * 797
difference: -3072 relative magnitude: -0.00980414058984416 isPrime: 397 True 797 True
GPT claims that [ 3  2  4  9  9 29  9] gives [ 2  4  9  6  1 16  9] actual is [-100 -100 -100    9   29    9   29]
acutal: 100489 = 317 * 317 output 100553 = 193 * 521
difference: -64 relative magnitude: -0.0006368856292728557 isPrime: 193 True 521 True
GPT claims that [ 2 31 30 21  3  1 31] gives [31 30 21  8 13 11  9] actual is [-100 -100 -100    3    1   31   21]
acutal: 98261 = 97 * 1013 output 97109 = 269 * 361
difference: 1152 relative magnitude: 0.0117238782426395 isPrime: 269 True 361 False
GPT claims that [ 

In [206]:
global numPrimes, numNotPrimes
numPrimes, numNotPrimes = 0,0
give_exam(test_dataset, batch_size=1024, max_batches=10, showFailures=True, failureCallback=failureFactor)
print(numPrimes, numNotPrimes)

GPT claims that [16 16 13 21 19  9 27] gives [16 13 21 20 13 25  9] actual is [-100 -100 -100   19    9   27   13]
acutal: 541109 = 617 * 877 output 528277 = 653 * 809
difference: 12832 relative magnitude: 0.023714260897527116 isPrime: 653 True 809 True
GPT claims that [20  6 29 29 24  5 26] gives [ 6 29 29 22 15 29 19] actual is [-100 -100 -100   24    5   26   25]
acutal: 662461 = 773 * 857 output 680893 = 719 * 947
difference: -18432 relative magnitude: -0.027823524705605313 isPrime: 719 True 947 True
GPT claims that [16  4  4 17 22 23 22] gives [ 4  4 17 19  5 26 25] actual is [-100 -100 -100   22   23   22   23]
acutal: 528529 = 727 * 727 output 525341 = 613 * 857
difference: 3188 relative magnitude: 0.006031835528419444 isPrime: 613 True 857 True
GPT claims that [ 0 10  8 21  0 13 25] gives [10  8 21  2  3  5  7] actual is [-100 -100 -100    0   13   25    9]
acutal: 10517 = 13 * 809 output 11189 = 67 * 167
difference: -672 relative magnitude: -0.06389654844537415 isPrime: 67 Tru

In [33]:
# n_layer=2, n_head=2, n_embd=128 - gets 100%



# factoring:
# base 16, 2 digits:
# 8 layers, 2 heads stalls out at 0.14311, and also seems to be memorizing because test loss goes down but then back up

# 8 layers, 8 heads, embed 64
# epoch 453 iter 3: train loss 0.98656. lr 5.825030e-04:  80%|████████  | 4/5 [00:00<00:00,  7.09it/s]
# epoch 102 iter 0: train loss 1.47119. lr 1.204984e-04:  20%|██        | 1/5 [00:00<00:00,  9.02it/s]