In [108]:
import torch
from torch import nn
from torch.nn import functional as F
import math
from minGPT.mingpt.utils import set_seed
import numpy as np
def approx_equals(a, b):
    assert torch.allclose(a, b, 0.0001), str(a) + "!=" + str(b)

In [159]:
import automataBattle
import random
from importlib import reload
reload(automataBattle)
from torch.utils.data import Dataset
class FastLearnAutomataDataset(Dataset):
    def __init__(self, nStates, nSymbols, split, sequenceLen, numSequences):
        self.nStates = nStates
        self.nSymbols = nSymbols
        self.split = split # train/test
        self.vocab_size = nSymbols*nSymbols
        # +1 due to potential carry overflow, but then -1 because very last digit doesn't plug back
        self.block_size = sequenceLen
        
        self.sequenceLen, self.numSequences = sequenceLen, numSequences
        
        '''
        # split up all addition problems into either training data or test data
        num = (10**self.ndigit)**2 # total number of possible combinations
        r = np.random.RandomState(1337) # make deterministic
        perm = r.permutation(num)
        num_test = min(int(num*0.2), 1000) # 20% of the whole dataset, or only up to 1000
        self.ixes = perm[:num_test] if split == 'test' else perm[num_test:]
        '''


    def __len__(self):
        return self.numSequences

    def __getitem__(self, idx):
        
        a = automataBattle.Automata(nStates=self.nStates, symbols=range(self.nSymbols), randomConnect=True)
        a.minimize()
        while a.complexity() != self.nStates:
            a = automataBattle.Automata(nStates=self.nStates, symbols=range(self.nSymbols), randomConnect=True)
            a.minimize()
        X, Y = a.generate(self.sequenceLen+1, lambda: random.choice(range(self.nSymbols)))
        x = torch.tensor(X)
        y = torch.tensor(Y) # predict the output of the Automata
        previous = y[:-1]
        shiftedForwadInputsOne = x[1:]
        outputs = y[1:] # Todo: look into encoding multiple things ("tuple encodings") instead of this gross thing
        xOutput = shiftedForwadInputsOne+previous*self.nSymbols
        yOutput = outputs
        return xOutput, yOutput
        
        '''
        # given a problem index idx, first recover the associated a + b
        idx = self.ixes[idx]
        nd = 10**self.ndigit
        a = idx // nd
        b = idx %  nd
        c = a + b
        render = f'%0{self.ndigit}d%0{self.ndigit}d%0{self.ndigit+1}d' % (a,b,c) # e.g. 03+25=28 becomes "0325028" 
        dix = [int(s) for s in render] # convert each character to its token index
        # x will be input to GPT and y will be the associated expected outputs
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long) # predict the next token in the sequence
        y[:self.ndigit*2-1] = -100 # we will only train in the output locations. -100 will mask loss to zero
        return x, y
        '''

In [264]:

class HelpfulModule(nn.Module):
    def __init__(self):
        super().__init__()
        self._myHyperParams = {}
        
    def __setattr__(self, attr, val):
        super().__setattr__(attr, val) # make sure to call super because torch.nn.Module also overrides this
        simpleTypes = [int, str, float]
        if type(val) in simpleTypes or (type(val) is list and (len(val) == 0 or type(val[0]) in simpleTypes)):
            self._myHyperParams[attr] = val
            
    
    def extra_repr(self):
        return ", ".join([(str(param) + ": " + str(val)) for param, val in self._myHyperParams.items()])

class SoftRELULayer(HelpfulModule):
    def __init__(self, weightLess, offset):
        super().__init__()
        self.weightLess = weightLess
        self.offset = offset
    
    def forward(self, x):
        biggerThan = torch.max(torch.tensor([0.0]), x)
        lessThan = torch.min(torch.tensor([0.0]), x)
        return biggerThan + lessThan*self.weightLess - self.offset
    

# TODO: see if batch norm works for transformers


class BatchedIndexCrossEntropyLoss(HelpfulModule):
    def __init__(self):
        super().__init__()
    
    def forward(self, y, target, rollupLosses=True):
        '''
        torch.gather(input, dim, index) does the following
        out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
        out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
        out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2

        y is [b,L,vocabSize]
        goals is [b,L]
        we want
        out[bi,l] = y[bi,l,goals[bi,l]]
        but that doesn't fit the above pattern.
        To fix this, we can just do
        out[bi,l,k] = y[bi,l,goals[bi,l,k]]
        where k is only ever 0
        so we need to add that axis to goals
        '''
        b,L = target.shape
        values = torch.gather(y, 2, target.view((b,L,1)))
        # Now make it look like b,L
        values = values.view((b,L))
        # Actual pr for those values is 1.0, so
        # -target*x.log()-(1.0-target)*(1.0-x).log()
        # turns into
        res = -values.log()
        # this gives us one loss per (batch, word), usually they just want a single loss value, so this can roll them up if you want
        if rollupLosses: return res.mean()
        else: return res

class BatchedCrossEntropyLoss(HelpfulModule):
    def __init__(self):
        super().__init__()
    
    def forward(self, y, target, rollupLosses=True):
        vals = -target*y.log()-(1.0-target)*(1.0-y).log()
        # sum along not batch axis
        res = vals.sum(axis=2)
        if rollupLosses: return res.mean()
        else: return res
        # -target[i]*log(x[i])-(1-target[i])*log(1-x[i])

class LayerNorm(HelpfulModule):
    def __init__(self, eps=0.01):
        super().__init__()
        self.eps = eps
        self.multiplicitiveWeight = nn.Parameter(torch.tensor(1.0))
        self.additiveWeight = nn.Parameter(torch.tensor(0.0))
        self.nBatches = 0
    
    def forward(self, x):
        mu = x.mean((1,2,3), keepDim=True)
        var = x.var((1,2,3), keepDim=True) # TODO: add correction based on batch size
        normalizedOutput = (x-mu)/torch.max(var, torch.tensor(self.eps))
        return normalizedOutput*self.multiplicitiveWeight+self.additiveWeight

class EmbeddingLayer(HelpfulModule):
    def __init__(self, vocabSize, embeddingDim):
        super().__init__()
        self.vocabSize, self.embeddingDim = vocabSize, embeddingDim
        # Todo: what is good initialization for embeddings?
        self.embeddings = nn.Parameter(torch.normal(0, 1, [vocabSize, embeddingDim]))
    # Inputs should be dimension [batchSize] and they should be integers
    def forward(self, x):
        return self.embeddings[x]
    
class Transformer(HelpfulModule):
    def __init__(self, numHeads, vocabSize, embeddingDim, keyDim, valueDim, hiddenSize, numLayers, **kwargs):
        super().__init__()
        self.numHeads, self.vocabSize, self.embeddingDim, self.keyDim, self.valueDim, self.hiddenSize, self.numLayers = numHeads, vocabSize, embeddingDim, keyDim, valueDim, hiddenSize, numLayers
        n, d, k, v, m = numHeads, embeddingDim, keyDim, valueDim, hiddenSize
        self.n, self.d, self.k, self.v, self.m = n,d,k,v,m
        self.embedding = EmbeddingLayer(vocabSize, embeddingDim)
        self.encodingLayers = nn.Sequential(*[TransformerBlock(n,d,k,v,m,**kwargs) for _ in range(numLayers)])
        self.finalProjection = nn.Parameter(torch.normal(0, 1, [n*d, vocabSize]))
        self.softmax = nn.Softmax(dim=2)
        self.batchedPrLoss = BatchedCrossEntropyLoss()
        self.batchedIndexLoss = BatchedIndexCrossEntropyLoss()
        # TODO: positional encodings
    
    def configure_optimizers(self, config):
        pass
    
    def forward(self, x, targets=None, rollupLosses=True):
        # x is of size [b,L], word integer indices
        if len(x.shape) == 1: # make everythingn work for batch size 1
            x = x.view((1,x.shape[0]))
        b, L = x.shape
        
        n,d = self.n, self.d
        
        embeddedOutputs = self.embedding(x)
        # embeddedOutputs is of size [b,L,d]
        # we need to make it [b,L,n,d], so we can just use expand to make it look that size
        expandedEmbeddedOutputs = embeddedOutputs.view((b,L,1,d)).expand((b, L, n, d))
        # now it's ready to go through the embeddings
        # It's currently dim [b,L,n,d], we need to make it [b,L,vocabSize]
        # For now I will just flatten and then project, so first make it [b,L,n*d]
        flattenedOutputs = expandedEmbeddedOutputs.reshape((b,L,n*d))
        # project to [b,L,vocabSize]
        finalProj = torch.einsum("iv,bli->blv", self.finalProjection, flattenedOutputs)
        # Use softmax to convert to prs
        wordPrs = self.softmax(finalProj)
        loss = None
        if targets is not None:
            if targets.dtype == torch.int64: # fitting to desired word indices
                if len(targets.shape) == 1: # if single batch, expand out
                    targets = targets.view((1, targets.shape[0]))
                loss = self.batchedIndexLoss(wordPrs, targets, rollupLosses=rollupLosses)
            else: # fitting to word prs
                if len(targets.shape) == 2: # if single batch, expand out
                    targets = targets.view((1, targets.shape[0], targets.shape[1]))
                loss = self.batchedPrLoss(wordPrs, targets, rollupLosses=rollupLosses)
        
        return wordPrs, loss
    def configure_optimizers(self, train_config):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name

                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)
                else:
                    decay.add(fpn)


        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer
        

# Transformers work by processing each word in turn

class MultiHeadSelfAttention(HelpfulModule):
    def __init__(self, n, d, k, v):
        super().__init__()
        self.n, self.d, self.kDim, self.vDim = n,d,k,v
        # Todo: compute initialization scaling factors
        # TODO: What about more things than just QKV? Like four or five or something
        self.Q = nn.Parameter(torch.normal(0, 1, [k, d]))
        self.K = nn.Parameter(torch.normal(0, 1, [k, d]))
        self.V = nn.Parameter(torch.normal(0, 1, [v, d]))
        self.Wch = nn.Parameter(torch.normal(0, 1, [d, v]))
        self.softmax = torch.nn.Softmax(dim=1)
    def forward(self, x):
        # x is [b,L,n,d]
        # b is batch size
        # L is sentence length
        # n is num heads
        # d is embedding dimension
        # we need to use Q, K, V to make a qi, ki, vi for each word
        # because we dot qi and kj, they need to be same dim, call this k
        # vi can be any dim, call this v
        # we need [b,L,n,d] -> [b,L,n,k] for qi and ki
        # we need [b,L,n,d] -> [b,L,n,v] for vi
        # [b,L,n,k]
        
        b,L,n,d,kDim,vDim = x.shape[0], x.shape[1], self.n, self.d, self.kDim, self.vDim
        
        q = torch.einsum("kd,blnd->blnk", self.Q, x)
        k = torch.einsum("kd,blnd->blnk", self.K, x)
        v = torch.einsum("vd,blnd->blnv", self.V, x)
        # Normally people just do a massive matrix, but that is quadratic in terms of L, and very wasteful with memory
        # Instead, we will do a loop over each word and do this for each word.
        # It's still quadratic in terms of L for time complexity (and slightly slower than giant matrix, because we are in python), but now linear in terms of space complexity, which is important for GPU space
        inds = torch.tensor(range(L))
        u = torch.zeros([b,L,n,vDim])
        for i in range(L):
            # q is [b,L,n,k]
            # expand it so it looks as k so we can do dot product
            qi = q[:,i,:,:].view((b,1,n,kDim)).expand((b,L,n,kDim))
            # dot product is component wise product and then sum, so just do that
            # scores is now [b,L,n]
            scores = (qi*k).sum(axis=3)/math.sqrt(kDim) # also divide by sqrt(k)
            scores[:,inds>i,:] = np.NINF # mask out words after current word
            scores = self.softmax(scores)
            # scores is [b,L,n], we need to make it look like [b,L,n,1] so we can expand it along last axis 
            scores = scores.view((b,L,n,1)).expand((b,L,n,vDim))
            ui = (scores*v).sum(axis=1)
            u[:,i,:,:] = ui
        # u is [b,L,n,vDim]
        # we want [b,L,n,d]
        return torch.einsum("dv,blnv->blnd", self.Wch, u)         

    
class TransformerBlock(HelpfulModule):
    def __init__(self, n, d, k, v, m, **kwargs):
        super().__init__()
        # input x is [b,n,d]
        # b is batchSize
        # n is number of heads
        # d is embedding dimension
        # k is key size
        # m is hidden layer size
        self.n, self.d, self.k, self.m = n, d, k, m
        self.W1 = nn.Parameter(torch.normal(0, 1, [m, d]))
        self.W2 = nn.Parameter(torch.normal(0, 1, [d, m]))
        self.attention = MultiHeadSelfAttention(n,d,k,v)
        self.layerNorm1 = LayerNorm()
        self.layerNorm2 = LayerNorm()
        self.RELU = SoftRELULayer(**kwargs)
        
    def forward(self, x):
        attentionOut = self.attention(x)
        ui = self.layerNorm1(x+attentionOut) # todo: check to see if layer norm inside res net block is doing weird stuff, since we have a second res net thing below not attached
        # [d,m]x[b,n,d] -> [b,n,m]
        denseOutput = self.RELU(torch.einsum("md,bnd->bnm", self.W1, ui))
        projectedBack = torch.einsum("dm,bnm->bnd", self.W2, denseOutput)
        return self.layerNorm2(ui+denseOutput)
        
    

In [290]:
class DataLoaderSimple(object):
    def __init__(self, datas, batchSize):
        self.datas = datas
        self.batchSize = batchSize
        self.index = 0
    def __len__(self):
        return len(self.datas)
    def __iter__(self):
        return self
    def __next__(self):
        curBatchX = []
        curBatchY = []
        for i in range(self.batchSize):
            x,y = self.datas[i]
            i += 1
            curBatchX.append(x)
            curBatchY.append(y)
        return torch.stack(curBatchX), torch.stack(curBatchY)
import minGPT.mingpt.trainer
reload(minGPT.mingpt.trainer)
from minGPT.mingpt.trainer import Trainer, TrainerConfig
from importlib import reload
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm
def tryTrain():
    batchSize = 64
    seqLen = 100
    numHeads = 4
    embeddingDim = 16
    keyDim = 16
    valueDim = 16
    vocabSize=4
    hiddenSize=64
    numLayers = 8
    b,L,n,d,k,v = batchSize,seqLen,numHeads,embeddingDim,keyDim,valueDim
    weightLess = 0.5
    offset = 0.5
    model = Transformer(numHeads=numHeads, vocabSize=vocabSize, embeddingDim=embeddingDim, keyDim=keyDim, valueDim=valueDim, hiddenSize=hiddenSize, numLayers=numLayers, weightLess=weightLess, offset=offset)
    
    train_dataset = FastLearnAutomataDataset(nStates=2, nSymbols=2, split='train', sequenceLen=seqLen, numSequences=300000)
    test_dataset = FastLearnAutomataDataset(nStates=2, nSymbols=2, split='test', sequenceLen=seqLen, numSequences=1000)
    
    tconf = TrainerConfig(max_epochs=100, batch_size=batchSize, learning_rate=6e-5,
                          lr_decay=True, warmup_tokens=2048, final_tokens=50*len(train_dataset)*(2+1),
                          num_workers=0)
    optimizer = model.configure_optimizers(tconf)
    trainer = Trainer(model, train_dataset, test_dataset, tconf)
    loader = DataLoaderSimple(train_dataset, batchSize)
    pbar = tqdm(enumerate(loader), total=len(loader))
    for i, (x,y) in pbar:
        logits, loss = model(x, y)
        loss = loss.mean() # collapse all losses if they are scattered on multiple gpus
        model.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), tconf.grad_norm_clip)
        optimizer.step()
        pbar.set_description("epoch: " + str(i) + " loss:" + str(loss))
    x, y = train_dataset[0]
    print(x,y)
    print(trainer.device)
    output, loss = model(x, y)
    loss.backward()
    trainer.train()
    

tryTrain()
    


epoch: 892 loss:tensor(27138.2090, grad_fn=<MeanBackward0>):   0%|          | 893/300000 [00:23<2:08:32, 38.78it/s]


KeyboardInterrupt: 

In [222]:
def testTransformer7():
    b,L,n,d,k,v,vocabSize, hiddenSize, numLayers = 2,3,4,8,6,7,9, 10, 11
    weightLess = 0.5
    offset = 0.5
    numHeads, vocabSize, embeddingDim, keyDim, valueDim, hiddenSize, numLayers = n, vocabSize, d, k, v, hiddenSize, numLayers 
    gpt = Transformer(numHeads=numHeads, vocabSize=vocabSize, embeddingDim=embeddingDim, keyDim=keyDim, valueDim=valueDim, hiddenSize=hiddenSize, numLayers=numLayers, weightLess=weightLess, offset=offset)
    inputs = torch.tensor([[3,4,1],[1,0,2]])
    targets = torch.normal(0, 1, [b,L, vocabSize])
    sm = nn.Softmax(dim=2)
    targets = sm(targets)
    print("targets", targets, targets.shape, targets[0,0], targets[0,0].sum())
    y, losses = gpt(inputs, targets)
    print("y", y, y.shape, (b,L,vocabSize))
    assert(y.shape == (b,L,vocabSize))
    print("losses", losses, losses.shape)
    approx_equals(y.sum(axis=2), torch.ones([b,L]))
    from minGPT.mingpt import model
    from minGPT.mingpt.model import GPT, GPTConfig, GPT1Config
    mconf = GPTConfig(vocabSize, L, n_layer=numLayers, n_head=numHeads, n_embd=embeddingDim)
    model = GPT(mconf)
    from torch.utils.data.dataloader import DataLoader
    data = FastLearnAutomataDataset(nStates=2, nSymbols=2, split='train', sequenceLen=L, numSequences=60)
    print("dats:", data[0], data.sequenceLen)
    loader = DataLoader(data, shuffle=True, pin_memory=True,
                        batch_size=b,
                        num_workers=0)
    for x,y in loader:
        print("datas:", x,y, x.shape, y.shape)
        break
    y2, losses2 = model(x, y)
    y3, losses3 = gpt(x,y)
    print("y2, losses2", y2, y2.shape, losses2, losses2.shape)
    print("y3, losses3", y3, y3.shape, losses3, losses3.shape)
    spooked = y2.view(-1, y2.size(-1))
    print(y2, y2.shape)
    print("spooked", spooked, spooked.shape)
testTransformer7()


targets tensor([[[0.1594, 0.0643, 0.3269, 0.1214, 0.1380, 0.0313, 0.0908, 0.0361,
          0.0319],
         [0.0869, 0.1340, 0.1009, 0.0286, 0.2093, 0.2145, 0.0711, 0.0465,
          0.1082],
         [0.1025, 0.1424, 0.1804, 0.1474, 0.0647, 0.1346, 0.0705, 0.1309,
          0.0266]],

        [[0.4072, 0.0696, 0.1560, 0.0229, 0.0523, 0.2354, 0.0202, 0.0146,
          0.0217],
         [0.0646, 0.1708, 0.0111, 0.0682, 0.0167, 0.0469, 0.5477, 0.0572,
          0.0167],
         [0.0370, 0.0320, 0.1552, 0.1518, 0.0717, 0.1811, 0.0378, 0.2584,
          0.0751]]]) torch.Size([2, 3, 9]) tensor([0.1594, 0.0643, 0.3269, 0.1214, 0.1380, 0.0313, 0.0908, 0.0361, 0.0319]) tensor(1.)
y tensor([[[3.5294e-02, 3.5906e-05, 2.7673e-07, 5.7191e-04, 1.4594e-05,
          1.1280e-01, 1.2557e-07, 9.1254e-03, 8.4216e-01],
         [4.7185e-02, 2.7636e-08, 5.4642e-05, 9.0173e-03, 8.6240e-04,
          9.2805e-01, 1.3350e-02, 3.9484e-06, 1.4714e-03],
         [1.8485e-10, 4.1149e-04, 6.1090e-07, 7.9660e-01

In [205]:
def testCrossEntropy():
    inputs = torch.tensor([[3,4,1],[1,0,2]])
    goals = torch.tensor([[1,2,0],[2,1,1]])
    b,L,n,d,k,v,vocabSize, hiddenSize, numLayers = 2,3,4,8,6,7,9, 10, 11
    weightLess = 0.5
    offset = 0.5
    numHeads, vocabSize, embeddingDim, keyDim, valueDim, hiddenSize, numLayers = n, vocabSize, d, k, v, hiddenSize, numLayers 
    gpt = Transformer(numHeads=numHeads, vocabSize=vocabSize, embeddingDim=embeddingDim, keyDim=keyDim, valueDim=valueDim, hiddenSize=hiddenSize, numLayers=numLayers, weightLess=weightLess, offset=offset)
    y = gpt(inputs)[0]
    print("y", y, y.shape)
    # for goals[b,i] we want to access the value at y[b,i,goals[b,i]]
    # index for goals[b,i] is (b,i,goals[b,i])
    # lets create that index lookup
    
    '''
    torch.gather(input, dim, index) does the following
    out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
    out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
    out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
    
    y is [b,L,vocabSize]
    goals is [b,L]
    we want
    out[bi,l] = y[bi,l,goals[bi,l]]
    but that doesn't fit the above pattern.
    To fix this, we can just do
    out[bi,l,k] = y[bi,l,goals[bi,l,k]]
    where k is only ever 0
    so we need to add that axis to goals
    '''
    values = torch.gather(y, 2, goals.view((b,L,1)))
    
    
    print(values)
    for bi in range(b):
        for l in range(L):
            approx_equals(y[bi,l,goals[bi,l]],values[bi,l])
testCrossEntropy()

y tensor([[[7.1465e-02, 3.2889e-03, 7.0081e-03, 5.4360e-04, 9.1665e-01,
          4.5948e-04, 4.4783e-07, 5.4686e-04, 3.4154e-05],
         [3.5865e-05, 5.2502e-05, 9.9611e-01, 7.6170e-08, 1.2315e-03,
          1.3414e-06, 2.7552e-05, 3.2243e-04, 2.2150e-03],
         [6.8272e-01, 9.1474e-07, 7.7590e-08, 3.2535e-04, 2.5515e-07,
          2.9180e-04, 3.0933e-01, 6.8850e-04, 6.6424e-03]],

        [[6.8272e-01, 9.1474e-07, 7.7590e-08, 3.2535e-04, 2.5515e-07,
          2.9180e-04, 3.0933e-01, 6.8850e-04, 6.6424e-03],
         [1.7251e-06, 1.5701e-07, 1.6659e-08, 9.9988e-01, 8.1252e-08,
          5.6448e-05, 3.0365e-06, 6.0190e-05, 2.8927e-06],
         [1.0939e-05, 3.0040e-03, 1.4954e-04, 2.4149e-04, 1.6799e-04,
          3.0257e-02, 9.6423e-01, 1.7110e-03, 2.2970e-04]]],
       grad_fn=<SoftmaxBackward>) torch.Size([2, 3, 9])
tensor([[[3.2889e-03],
         [9.9611e-01],
         [6.8272e-01]],

        [[7.7590e-08],
         [1.5701e-07],
         [3.0040e-03]]], grad_fn=<GatherBackwar

In [141]:
def testTransformer6():
    b,L,n,d,k,v = 2,3,4,5,6,7
    vocabSize = 10
    embeddingDim = d
    emb = EmbeddingLayer(vocabSize, embeddingDim)
    inputs = torch.tensor([[3,4,1],[1,0,2]])
    print(inputs.shape, inputs)
    print(emb.embeddings)
    e = emb(inputs)
    print(e, e.shape)
testTransformer6()

torch.Size([2, 3]) tensor([[3, 4, 1],
        [1, 0, 2]])
Parameter containing:
tensor([[ 1.1750,  1.5297, -0.8046,  0.9410, -0.4363],
        [-0.6042,  0.6766, -0.9753,  0.7356, -1.0042],
        [-1.1727,  0.5897, -0.2052, -2.0167,  0.1935],
        [ 0.1514, -0.2831, -0.0529,  1.9061, -0.1231],
        [-0.9733,  0.9495,  0.3668,  1.3234,  0.4505],
        [-0.0030,  0.6066, -1.3297, -0.1344, -0.0678],
        [ 1.6767, -0.2011,  0.4436,  0.5443,  0.8576],
        [ 0.8300,  0.7594, -0.9010,  0.4384, -0.2681],
        [-0.5617, -1.4651, -1.0412, -0.4914, -0.4653],
        [-0.9644,  0.8375,  1.8821, -0.8017,  0.1264]], requires_grad=True)
tensor([[[ 0.1514, -0.2831, -0.0529,  1.9061, -0.1231],
         [-0.9733,  0.9495,  0.3668,  1.3234,  0.4505],
         [-0.6042,  0.6766, -0.9753,  0.7356, -1.0042]],

        [[-0.6042,  0.6766, -0.9753,  0.7356, -1.0042],
         [ 1.1750,  1.5297, -0.8046,  0.9410, -0.4363],
         [-1.1727,  0.5897, -0.2052, -2.0167,  0.1935]]],
       gr

In [61]:
def testTransformer5():
    b,L,n,d,k,v = 2,3,4,5,6,7
    attn = MultiHeadSelfAttention(n,d,k,v)
    x = torch.normal(0, 1, [b,L,n,d])
    y = attn(x)
    print("y", y, y.shape)
testTransformer5()

y tensor([[[[ -3.6704,  -9.3205,   4.9568,   0.7842,   1.8210],
          [ -1.8469,  -1.2511,   2.4165,  -8.4936,   2.3370],
          [ -3.7076, -11.2480,  -2.2541,   0.1730,   3.9289],
          [  4.1189,  12.6030,  -2.1638,   0.1501,  -3.8233]],

         [[  1.0092,   3.1251,  -4.8048,  -7.0410,   0.3013],
          [  3.1568,   8.8109,   3.1315,   1.8201,  -2.8351],
          [  3.8105,   5.1023,  -8.9080,   7.9049,  -3.8480],
          [  2.3823,   9.4847,   2.0629,  -4.3741,  -2.1458]],

         [[  1.2555,   2.0134,  -4.1270,   4.3320,  -1.9510],
          [  2.0929,   5.9423,   2.8129,   3.4904,  -2.7486],
          [  3.3003,   4.9170,  -7.2156,   7.2499,  -3.3391],
          [  0.2540,   5.4972,   6.9223,  -9.5532,  -0.1014]]],


        [[[  1.1796,   5.4335,  -1.8405, -11.9477,   1.3111],
          [ -0.0435,  -2.5657,  -1.9573,  -3.5210,  -0.3614],
          [  3.9728,   3.3343,  -5.2243,  18.5582,  -5.8535],
          [  4.3869,  10.8971,  -0.3297,  12.2588,  -4.5315]

In [20]:
def testTransformer4():
    b,L,n,d,k,v = 2,3,4,5,6,7
    inds = torch.tensor(range(L))
    sm = torch.nn.Softmax(dim=1)
    a = torch.normal(0, 1, [b,L,n])
    print(a)
    a[:,inds>1,:] = np.NINF
    print(a)
    print(sm(a))
    print(sm(a)[0,:,1])
    
testTransformer4()

tensor([[[ 0.2370,  0.5129,  0.0639, -3.6355],
         [ 0.8274,  1.5479,  0.1948,  0.0440],
         [ 0.7549,  0.3560, -0.0806, -0.6300]],

        [[ 0.9438,  0.6535,  0.1992, -1.4359],
         [ 0.2629,  0.8505,  1.1858,  0.5235],
         [ 1.4031,  0.4500,  1.0992, -1.3850]]])
tensor([[[ 0.2370,  0.5129,  0.0639, -3.6355],
         [ 0.8274,  1.5479,  0.1948,  0.0440],
         [   -inf,    -inf,    -inf,    -inf]],

        [[ 0.9438,  0.6535,  0.1992, -1.4359],
         [ 0.2629,  0.8505,  1.1858,  0.5235],
         [   -inf,    -inf,    -inf,    -inf]]])
tensor([[[0.3565, 0.2621, 0.4673, 0.0246],
         [0.6435, 0.7379, 0.5327, 0.9754],
         [0.0000, 0.0000, 0.0000, 0.0000]],

        [[0.6639, 0.4509, 0.2716, 0.1235],
         [0.3361, 0.5491, 0.7284, 0.8765],
         [0.0000, 0.0000, 0.0000, 0.0000]]])
tensor([0.2621, 0.7379, 0.0000])


In [29]:
def testTransformer3():
    b, n, d, k, m = 2,3,4,5,6
    set_seed(27)
    a = torch.normal(0, 1, [b,d])
    b = a.view((b,1,d)).expand((b,n,d))
    print(b, b.shape, b.stride(), b.storage())
    x = torch.normal(0, 1, [b,n,d])
    
    
testTransformer3()

tensor([[[ 0.5664, -0.5493,  1.2395,  0.8272],
         [ 0.5664, -0.5493,  1.2395,  0.8272],
         [ 0.5664, -0.5493,  1.2395,  0.8272]],

        [[ 1.1796,  1.9616, -0.0884,  1.1684],
         [ 1.1796,  1.9616, -0.0884,  1.1684],
         [ 1.1796,  1.9616, -0.0884,  1.1684]]]) torch.Size([2, 3, 4]) (4, 0, 1)  0.5664487481117249
 -0.5492674112319946
 1.2395411729812622
 0.8271635174751282
 1.1796386241912842
 1.9616155624389648
 -0.0884392186999321
 1.1684051752090454
[torch.FloatStorage of size 8]


TypeError: normal(): argument 'size' must be tuple of ints, but found element of type Tensor at pos 1

In [17]:
def testEmbeddings():
    nWords, embeddingDim = 3, 4
    embedding = EmbeddingLayer(nWords, embeddingDim)
    inputs = torch.tensor([[0,2,1, 2, 1]]).reshape(5)
    print(inputs)
    print(embedding(inputs).shape)
testEmbeddings()
    
    
    

tensor([0, 2, 1, 2, 1])
torch.Size([5, 4])


In [17]:
def testTransformer2():
    b, n, d, k = 2, 3, 4, 5
    set_seed(27)
    dotQueryKey = torch.normal(0, 1, [b,n,n])
    softmax = torch.nn.Softmax(dim=2)
    queryPrs = softmax(dotQueryKey)
    vh = torch.normal(0, 1, [b,n,k])
    print("qprs", queryPrs)
    print("vh", vh)
    # the ith output is taking sum over j of (queryPrs[b,i,j])*(vh[b,j])
    #                            scalar            vector
    # queryPrs is [b,n,n]
    # vh       is [b,n,k]
    # so         queryPrs[b,i] is of dim n
    #            vh[b]         is of dim [n,k]
    # so         j ranges from 0 to n-1
    # fixing b and i and thinking of this as a small matrix, we do
    # queryPrs = [0.4, 0.6] (n of these)
    #              
    # vh       = [1.2,     3.4,     5.2] (each row is of length k)
    #          = [3.4,     2.3,     1.1] (there are n rows)
    # we do
    #            [0.4*1.2, 0.4*3.4, 0.4*5.2 ]
    #            [0.6*3.4, 0.6*2.3, 0.6*1.1 ]
    # and then we sum them:
    #            [0.4*1.2+0.6*3.4, 0.4*3.4+0.6*2.3, 0.4*5.2+0.6*1.1]
    # In other words, for a given i we dot the ith row of queryPrs[b] (dim n) by each column in vh[b] (vh[b] is [n,k], so each column is dim n)
    # Thus, the output's [b,i,j] value is the ith row of queryPrs[b] dot the jth column of vh[b] 
    # for regular matrix multiplication of A and B, the [i,j]th value is ith row of A dot jth column of B, so this is just regular matrix multiplication.
    # In einsum: torch.einsum("bij,bjk->bik")
    # which means that our output[b,i] is
    summedRows = torch.einsum("bij,bjk->bik", queryPrs, vh)
    for bi in range(b):
        approx_equals(summedRows[bi, 0,0], queryPrs[bi, 0]@vh[bi, :,0])
        approx_equals(summedRows[bi, 0,1], queryPrs[bi, 0]@vh[bi, :,1])
        approx_equals(summedRows[bi, 1,0], queryPrs[bi, 1]@vh[bi, :,0])
        approx_equals(summedRows[bi, 1,1], queryPrs[bi, 1]@vh[bi, :,1])
    print("summedRows", summedRows)
    
    # we are currently [b,n,k],
    # now we need to project res back to a [b,n,d] size
    Wch = torch.normal(0, 1, [d,k])
    
    res = torch.einsum("dk,bnk->bnd", Wch, summedRows)
    for bi in range(b):
        approx_equals(res[bi,0,0], Wch[0]@summedRows[bi, 0,0])
        approx_equals(res[bi,0,1], Wch[0]@summedRows[bi, 0,0])
        approx_equals(res[bi,0,0], Wch[0]@summedRows[bi, 0,0])
        approx_equals(res[bi,0,0], Wch[0]@summedRows[bi, 0,0])
    print(res[0,0,1], Wch[1]@summedRows[0,0])
    
    a = torch.normal(0, 1, [2, 3])
    b = torch.normal(0, 1, [2, 3, 4])
    
testTransformer2()

qprs tensor([[[0.7313, 0.1338, 0.1350],
         [0.1290, 0.5566, 0.3144],
         [0.2761, 0.6204, 0.1035]],

        [[0.1808, 0.0482, 0.7710],
         [0.1420, 0.2864, 0.5716],
         [0.5909, 0.2983, 0.1109]]])
vh tensor([[[ 0.4601,  0.3644, -1.4775,  0.4753, -0.3383],
         [-0.5367,  1.5008, -0.7286,  0.4594,  0.4356],
         [-0.2073, -1.0252, -1.1372,  1.0307,  0.4656]],

        [[-0.8964,  0.5814, -0.9950, -0.9881, -0.1613],
         [ 0.1007,  0.9505,  0.9992, -0.8928,  1.6873],
         [ 0.4901,  0.2179,  0.0329,  0.0506, -0.0541]]])
summedRows tensor([[[ 0.2367,  0.3289, -1.3314,  0.5481, -0.1263],
         [-0.3046,  0.5601, -0.9537,  0.6411,  0.3452],
         [-0.2274,  0.9256, -0.9777,  0.5229,  0.2250]],

        [[ 0.2207,  0.3189, -0.1064, -0.1826,  0.0104],
         [ 0.1817,  0.4793,  0.1637, -0.3671,  0.4293],
         [-0.4452,  0.6512, -0.2862, -0.8445,  0.4019]]])
tensor(-0.9102) tensor(-0.9102)


In [89]:
def testTransformer():
    b, n, d, k = 2, 3, 4, 5
    from minGPT.mingpt.utils import set_seed
    set_seed(27)
    x = torch.normal(0, 1, [b,n,d])
    print("x:", x)
    x
    Q = torch.normal(0, 1, [k,d])
    K = torch.normal(0, 1, [k,d])
    V = torch.normal(0, 1, [k,d])
    # In other words
    print("Q:", Q)
    res = torch.einsum('kd,bnd->bnk', Q, x)
    # Check that it's the same both ways
    approx_equals(Q@(x[0,0]), res[0,0])
    approx_equals(Q@(x[1,0]), res[1,0])
    approx_equals(Q@(x[0,1]), res[0,1])
    approx_equals(Q@(x[1,1]), res[1,1])
    
    q = torch.einsum("kd,bnd->bnk", Q, x)
    k = torch.einsum("kd,bnd->bnk", Q, x)
    v = torch.einsum("kd,bnd->bnk", Q, x)
    
    print("q:", q)
    print("k:", k)
    dotQueryKey = torch.einsum("bij, bkj->bik", q, k)
    print("dq", dotQueryKey.shape, dotQueryKey)
    
    # dotQueryKey[b,i,j] is q[b,i] dot k[b,j]
    for ba in range(b):
        approx_equals(q[ba,0]@k[ba,0], dotQueryKey[ba,0,0])
        approx_equals(q[ba,0]@k[ba,1], dotQueryKey[ba,0,1])
        approx_equals(q[ba,1]@k[ba,0], dotQueryKey[ba,1,0])
        approx_equals(q[ba,1]@k[ba,1], dotQueryKey[ba,1,1])
    
testTransformer()

x: tensor([[[ 1.7650,  0.0664, -0.0706, -0.1672],
         [-0.4266,  1.5005, -0.2636, -1.0210],
         [-1.7975, -0.3770,  0.6140,  0.5948]],

        [[-0.8629, -0.9511, -0.9195, -0.7592],
         [ 0.3197, -0.6699,  1.5661,  0.8074],
         [-1.6036,  0.1696, -0.0308,  0.0434]]])
Q: tensor([[ 1.5008, -0.7286, -0.5098,  0.4431],
        [-0.9389,  1.5772,  1.6559, -0.4713],
        [ 0.4656, -0.8964,  0.5814, -0.9950],
        [ 0.6763,  0.1337,  0.0659,  0.5385],
        [ 0.9992, -0.8928,  1.6873,  0.4901]])
q: tensor([[[ 2.5625, -1.5905,  0.8876,  1.1078,  1.5033],
         [-2.0517,  2.8120, -0.6810, -0.6551, -2.7111],
         [-2.4725,  1.8294, -0.7338, -0.9053, -0.1320]],

        [[-0.4698, -1.8548,  0.6716, -1.1802, -1.9367],
         [ 0.5273,  0.8561,  0.8565,  0.6647,  3.9557],
         [-2.4954,  1.7016, -0.9597, -1.0404, -1.7845]]])
k: tensor([[[ 2.5625, -1.5905,  0.8876,  1.1078,  1.5033],
         [-2.0517,  2.8120, -0.6810, -0.6551, -2.7111],
         [-2.4725, 

In [85]:
from minGPT.mingpt.utils import set_seed
set_seed(27)
a = torch.normal(0, 1, [4, 5])
b = torch.normal(0, 1, [5, 4])
print("a", a)
print("b", b)
print(a[0]@b[:,0])
torch.einsum("ij,jk->ik", a, b)
set_seed(27)
print("later stuff")
a = torch.normal(0, 1, [4, 5])
b = torch.normal(0, 1, [4, 5])
print(a[0]@b[0])
print(a[0]@b[1])
print(a[1]@b[0])
print(a[1]@b[1])
# we want to go from a [nxm],[nxm] to a [n,n]
torch.einsum("ij,kj->ik", a, b)

a tensor([[ 1.7650,  0.0664, -0.0706, -0.1672,  0.0756],
        [-0.4957, -0.8165, -0.0069, -1.7975, -0.3770],
        [ 0.6140,  0.5948, -0.1926,  0.5088,  1.2001],
        [ 1.0033,  0.3197, -0.6699,  1.5661,  0.8074]])
b tensor([[-1.4775,  0.4753, -0.3383, -0.5367],
        [-0.8237, -0.4236,  0.3272, -1.9896],
        [-0.9389,  1.5772,  1.6559, -0.4713],
        [ 0.2374, -0.1400, -1.0862,  0.5188],
        [ 0.6763,  0.1337,  0.0659,  0.5385]])
tensor(-2.5848)
later stuff
tensor(-2.5248)
tensor(-0.3093)
tensor(1.6219)
tensor(1.0495)


tensor([[-2.5248, -0.3093,  2.8159,  0.9808],
        [ 1.6219,  1.0495,  0.2236, -1.1318],
        [-1.8210,  1.7328, -0.6841,  1.3748],
        [-2.6093,  0.8155,  0.2554,  1.1852]])

In [102]:
from minGPT.mingpt.utils import set_seed
set_seed(27)
a = torch.normal(0, 1, [2, 3, 4])
print(a)

softmax(a)[0,1].sum()

tensor([[[ 1.7650,  0.0664, -0.0706, -0.1672],
         [-0.4266,  1.5005, -0.2636, -1.0210],
         [-1.7975, -0.3770,  0.6140,  0.5948]],

        [[-0.8629, -0.9511, -0.9195, -0.7592],
         [ 0.3197, -0.6699,  1.5661,  0.8074],
         [-1.6036,  0.1696, -0.0308,  0.0434]]])


tensor(1.0000)

In [None]:
bn = LayerNorm(10)
inputs = torch.normal(0, 1, [30, 10])
ayy = bn(inputs)
print(ayy.mean(), ayy.std())