In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from minGPT.mingpt import model
# make deterministic
from minGPT.mingpt.utils import set_seed
set_seed(42)
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

In [2]:
import automataBattle
from torch.utils.data import Dataset
class FastLearnAutomataDataset(Dataset):
    def __init__(self, nStates, nSymbols, split, sequenceLen, numSequences):
        self.nStates = nStates
        self.nSymbols = nSymbols
        self.split = split # train/test
        self.vocab_size = nSymbols
        # +1 due to potential carry overflow, but then -1 because very last digit doesn't plug back
        self.block_size = sequenceLen
        
        self.sequenceLen, self.numSequences = sequenceLen, numSequences
        
        '''
        # split up all addition problems into either training data or test data
        num = (10**self.ndigit)**2 # total number of possible combinations
        r = np.random.RandomState(1337) # make deterministic
        perm = r.permutation(num)
        num_test = min(int(num*0.2), 1000) # 20% of the whole dataset, or only up to 1000
        self.ixes = perm[:num_test] if split == 'test' else perm[num_test:]
        '''


    def __len__(self):
        return self.numSequences

    def __getitem__(self, idx):
        
        a = automataBattle.Automata(nStates=self.nStates, symbols=range(self.nSymbols), randomConnect=True)
        a.minimize()
        while a.complexity() != self.nStates:
            a = automataBattle.Automata(nStates=self.nStates, symbols=range(self.nSymbols), randomConnect=True)
            a.minimize()
        X, Y = a.generate(self.sequenceLen)
        x = torch.tensor(X)
        y = torch.tensor(Y) # predict the output of the Automata
        return x, y
        
        '''
        # given a problem index idx, first recover the associated a + b
        idx = self.ixes[idx]
        nd = 10**self.ndigit
        a = idx // nd
        b = idx %  nd
        c = a + b
        render = f'%0{self.ndigit}d%0{self.ndigit}d%0{self.ndigit+1}d' % (a,b,c) # e.g. 03+25=28 becomes "0325028" 
        dix = [int(s) for s in render] # convert each character to its token index
        # x will be input to GPT and y will be the associated expected outputs
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long) # predict the next token in the sequence
        y[:self.ndigit*2-1] = -100 # we will only train in the output locations. -100 will mask loss to zero
        return x, y
        '''

In [27]:

import minGPT
from importlib import reload
from minGPT.mingpt import trainer
from minGPT.mingpt import model
reload(minGPT.mingpt.model)
reload(minGPT.mingpt.trainer)
from minGPT.mingpt.model import GPT, GPTConfig, GPT1Config
import gc
model = None
train_dataset = None
test_dataset = None
gc.collect()
torch.cuda.empty_cache()
train_dataset = FastLearnAutomataDataset(nStates=2, nSymbols=2, split='train', sequenceLen=500, numSequences=300000)
test_dataset = FastLearnAutomataDataset(nStates=2, nSymbols=2, split='test', sequenceLen=500, numSequences=10)

# initialize a baby GPT model
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size, 
                  n_layer=32, n_head=4, n_embd=64)
model = GPT(mconf)
from minGPT.mingpt.trainer import Trainer, TrainerConfig
set_seed(27)

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=100, batch_size=8, learning_rate=6e-5,
                      lr_decay=True, warmup_tokens=8, final_tokens=50*len(train_dataset)*(2+1),
                      num_workers=0)
trainer = Trainer(model, train_dataset, test_dataset, tconf)
trainer.train()
    

10/19/2020 17:29:42 - INFO - minGPT.mingpt.model -   number of parameters: 1.631872e+06
  0%|                                                                                        | 0/37500 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 8.00 GiB total capacity; 6.04 GiB already allocated; 14.41 MiB free; 6.28 GiB reserved in total by PyTorch)