Train/Valid/Test fundamentals:

Train set is used for training the model

Validation set is used for checking if the model is overfitting on the training set. Having training loss lower than validation loss is fine, you are overfitting once your validation loss starts getting worse.

There's a trick called "flooding" where you start doing gradient ascent once your training loss gets too low. I think this makes more sense to start doing once your validation loss starts increasing. It's worth looking into how momentum opts and such should maybe be tweaked for when you do this ascent.

Test set should only be used once, and is used to see if your model can generalize to real world data it hasn't seen before. If you are careful and use differential privacy you can actually use it more than once (about Sqrt(n) times iirc) if you are okay with not getting an accuracy, and instead only getting a bit saying whether accuracy is significantly different from validation performance, see https://arxiv.org/abs/1506.02629

In [None]:
import torch
from torch import nn
import numpy as np

torch.min(torch.tensor([0.0]), torch.normal(0, 10, [10]))

In [None]:
class HelpfulModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self._myHyperParams = {}
        
    def __setattr__(self, attr, val):
        super().__setattr__(attr, val) # make sure to call super because torch.nn.Module also overrides this
        simpleTypes = [int, str, float]
        if type(val) in simpleTypes or (type(val) is list and (len(val) == 0 or type(val[0]) in simpleTypes)):
            self._myHyperParams[attr] = val
            
    
    def extra_repr(self):
        return ", ".join([(str(param) + ": " + str(val)) for param, val in self._myHyperParams.items()])
            

class FeedforwardLayer(HelpfulModule):
    def __init__(self, inSize, outSize):
        super().__init__()
        self.inSize = inSize
        self.outSize = outSize
        self.weights = nn.Parameter(torch.normal(0, 1, [inSize, outSize]))
        self.bias = nn.Parameter(torch.normal(0, 1, [outSize]))
    def forward(self, x):
        res = x@self.weights+self.bias
        return res
    

class SoftRELULayer(HelpfulModule):
    def __init__(self, weightLess, offset):
        super().__init__()
        self.weightLess = weightLess
        self.offset = offset
    
    def forward(self, x):
        biggerThan = torch.max(torch.tensor([0.0]), x)
        lessThan = torch.min(torch.tensor([0.0]), x)
        return biggerThan + lessThan*self.weightLess - self.offset

# Uses log-sum-exp trick.
# see https://stats.stackexchange.com/questions/115258/comprehensive-list-of-activation-functions-in-neural-networks-with-pros-cons section on softmax
# This is nice for when you know each data point has exactly one label
# Returns a value from 0-1, and guarantees sum of values is roughly 1.0
class SoftmaxLayer(HelpfulModule):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        maxVal, _ = torch.max(x, dim=1, keepdim=True)
        logValDenominator = maxVal+(x-maxVal).exp().sum(axis=1, keepdim=True).log()
        logValNumerator = x
        return (logValNumerator - logValDenominator).exp()
    
class EmbeddingLayer(HelpfulModule):
    def __init__(self, nClasses, embeddingDim):
        super().__init__()
        self.nClasses, self.embeddingDim = nClasses, embeddingDim
        # Todo: what is good initialization for embeddings?
        self.embeddings = nn.Parameter(torch.normal([nClasses, embeddingDim]))
    
    def forward(self, x):
        return x # TODO: embedding
    
# This is nice for when your data might have more than one label
# returns a value from 0-1, but sum of values might be anything
class SigmoidLayer(HelpfulModule):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        return 1.0/(1.0+(-x).exp())
        

class FixupLayer(HelpfulModule):
    def __init__(self, layer, fixupIters, fixupBatchSize, eps=0.01):
        super().__init__()
        assert fixupBatchSize>1, "Fixup batch size needs to be greater than one to compute std"
        self.fixupIters, self.fixupBatchSize = fixupIters, fixupBatchSize
        self.layer = layer
        x = layer.generateInputData(fixupBatchSize)
        layerOutput = layer(x)
        layerOutputShape = list(layerOutput.shape)[1:]
        self.avgStd = torch.ones(layerOutputShape)
        self.avgMean = torch.zeros(layerOutputShape)
        
        for i in range(fixupIters):
            x = layer.generateInputData(fixupBatchSize)
            y = layer(x)
            self.avgStd += y.std(axis=0)
            self.avgMean += y.mean(axis=0)
        
        self.avgStd /= float(fixupIters)
        self.avgMean /= float(fixupIters)
        
        self.avgStd = torch.clamp(self.avgStd, min=eps)
        
        
        # This will ensure every activation has mean 0 std 1
        
    def forward(self, x):
        return (self.layer(x)-self.avgMean)/self.avgStd
            
        
        
class DenseLayer(HelpfulModule):
    def __init__(self, inSize, outSize, act):
        super().__init__()
        self.inSize, self.outSize, self.act = inSize, outSize, act
        self.feedforward = FeedforwardLayer(inSize, outSize)
        
    def forward(self, x):
        return self.act(self.feedforward(x))
    
    def generateInputData(self, bs):
        return torch.normal(0, 1, [bs, self.inSize])
        

In [427]:
a = torch.normal(0, 1, [3, 4, 5, 2])


In [428]:

# Simple math for a single row (don't need to worry about axes)
def softmaxSingleRow(x):
    maxOfRow = torch.max(x)
    denominator = maxOfRow + (x - maxOfRow).exp().sum().log()
    numerator = x
    return (numerator - denominator).exp()

def softmaxTests(debug=False):
    batchSize=5
    def debugPrint(*args, **kwargs):
        if debug: print(*args, **kwargs)
    a = torch.normal(0, 1, [batchSize, 4])
    debugPrint(a)
    maxes = torch.max(a, dim=1, keepdim=True)[0]
    debugPrint(maxes)
    assert(torch.all(a-maxes-0.00001<=0))
    debugPrint(a - maxes)
    debugPrint((a - maxes - (a-maxes).exp().sum(axis=1, keepdim=True).log()).exp().sum(axis=1))
    sm = SoftmaxLayer()
    y = sm(a)
    debugPrint(y)
    debugPrint(y.sum(axis=0), y.sum(axis=0).shape)
    debugPrint(y.sum(axis=1), y.sum(axis=1).shape)
    debugPrint(y[0].sum(), softmaxSingleRow(a[0]).sum())
    debugPrint(y[1].sum(), softmaxSingleRow(a[1]).sum())
    debugPrint(softmaxSingleRow(a[0]), y[0])
    debugPrint(softmaxSingleRow(a[1]), y[1])
    # check to make sure that we are doing the right thing per batch
    approx_equals(softmaxSingleRow(a[0]), y[0])
    approx_equals(softmaxSingleRow(a[1]), y[1])
    # check that each batch summed is roughly 1.0
    approx_equals(y.sum(axis=1), torch.ones([batchSize]))

softmaxTests(False)
    

In [429]:
class SequentialLayer(HelpfulModule):
    def __init__(self, *layers):
        super().__init__()
        self.layers = layers
        for i, layer in enumerate(self.layers): self._modules[str(i)] = layer
    
    def forward(self, x):
        res = x
        for layer in self.layers:
            res = layer(res)
        return res
    
            

In [430]:
class FeedforwardNet(HelpfulModule):
    def __init__(self, inSize, hiddenSizes, outSize, act, finalAct, fixupIters, fixupBs):
        super().__init__()
        self.inSize, self.hiddenSizes, self.outSize, self.act, self.finalAct = inSize, hiddenSizes, outSize, act, finalAct
        allSizes = [inSize] + hiddenSizes + [outSize]
        self.layers = [DenseLayer(allSizes[i], allSizes[i+1], act) if i < len(allSizes)-2 else
                          DenseLayer(allSizes[i], allSizes[i+1], finalAct)
                       for i in range(len(allSizes)-1) ]
        self.layers = SequentialLayer(*[FixupLayer(layer, fixupBs, fixupIters) for layer in self.layers])
    
    def forward(self, x):
        return self.layers(x)
    
        

In [431]:
def batchStats(x):
    return x.mean(axis=0), x.std(axis=0)

In [458]:
batchSize = 100000
inputSize = 12
hiddenSizes = [4,5,6,7]
outputSize = 10
fixupIters = 1000
fixupBs = 2
act = SoftRELULayer(weightLess=0.5, offset=0.5)
finalAct = SoftmaxLayer()
x = torch.normal(0, 1, [batchSize, inputSize])
#net = FeedforwardNet(inputSize, hiddenSizes, outputSize, act, finalAct, fixupIters, fixupBs)
dense = DenseLayer(inputSize, outputSize, finalAct)
net = FixupLayer(dense, fixupIters, fixupBs)
print(net)
denseOutput = dense(x)
y = net(x)
#print("outputs:", y[0], y[1])
print("Stats", batchStats(net(x)))
print("Biases:", net.avgMean, net.avgStd)



FixupLayer(
  fixupIters: 1000, fixupBatchSize: 2
  (layer): DenseLayer(
    inSize: 12, outSize: 10
    (act): SoftmaxLayer()
    (feedforward): FeedforwardLayer(inSize: 12, outSize: 10)
  )
)
Stats (tensor([ 0.0060,  0.0358, -0.0072,  0.0036,  0.0264, -0.0456,  0.0774, -0.0175,
        -0.0044,  0.0612], grad_fn=<MeanBackward1>), tensor([2.1745, 1.9042, 2.0162, 1.6359, 1.9917, 1.4028, 2.6778, 1.9228, 1.5940,
        3.2203], grad_fn=<StdBackward1>))
Biases: tensor([0.0491, 0.0976, 0.0733, 0.1604, 0.0897, 0.2238, 0.0371, 0.0848, 0.1642,
        0.0200], grad_fn=<DivBackward0>) tensor([0.0611, 0.1091, 0.0877, 0.1739, 0.1110, 0.2189, 0.0471, 0.0959, 0.1787,
        0.0264], grad_fn=<ClampBackward>)


In [515]:
def variance(a, **kwargs):
    return (a.pow(2)).mean(**kwargs) + a.mean(**kwargs).pow(2)

def variance2(a, **kwargs):
    return (a-a.mean(keepdim=True, **kwargs)).pow(2).mean(**kwargs)




print(variance2(torch.normal(0, 1, [1000]), axis=0))

a = torch.normal(0, 1, [4, 5])
f = a.mean(axis=0, keepdim=True)
f.pow(2).mean()
print(a[:,0], a[:,0].mean())
print(f.shape, f)
print(a, a - a.mean(keepdim=True, axis=0), (a-a.mean(keepdim=True, axis=0)).pow(2).mean(keepdim=True, axis=0), variance(a, keepdim=True, axis=0), "hhh", variance2(a, axis=0))

variance(a, axis=0).mean(), variance2(a, axis=0).mean()

tensor(0.9983)
tensor([ 0.3440,  1.4775, -0.2170,  1.1752]) tensor(0.6949)
torch.Size([1, 5]) tensor([[ 0.6949,  0.4711,  0.8089, -0.7772,  0.1219]])
tensor([[ 0.3440,  0.7092,  0.6088, -0.6691,  1.2032],
        [ 1.4775, -0.1609,  0.7756, -0.9842, -0.8903],
        [-0.2170,  0.1969,  1.0542, -1.1420,  0.1661],
        [ 1.1752,  1.1392,  0.7972, -0.3135,  0.0085]]) tensor([[-0.3509,  0.2381, -0.2002,  0.1081,  1.0813],
        [ 0.7826, -0.6320, -0.0333, -0.2070, -1.0122],
        [-0.9119, -0.2742,  0.2452, -0.3648,  0.0442],
        [ 0.4803,  0.6681, -0.0118,  0.4637, -0.1133]]) tensor([[0.4495, 0.2444, 0.0254, 0.1007, 0.5522]]) tensor([[1.4154, 0.6883, 1.3341, 1.3088, 0.5819]]) hhh tensor([0.4495, 0.2444, 0.0254, 0.1007, 0.5522])


(tensor(1.0657), tensor(0.2744))

In [279]:
def approx_equals(a, b):
    assert torch.allclose(a, b, 0.0001), str(a) + "!=" + str(b)

In [280]:
def testFeedforward(debug=False):
    def debugPrint(*args, **kwargs):
        if debug: print(*args, **kwargs)
    batchSize = 3
    inputSize = 2
    hiddenSize = 4
    x = torch.normal(0, 1, [batchSize, inputSize])
    debugPrint(x)
    layer = FeedforwardLayer(inputSize, hiddenSize)
    for p in layer.parameters():
        debugPrint(p)
    y = layer(x)
    debugPrint(y)
    weights = layer.weights
    bias = layer.bias
    firstOutputBatch1 = x[0]@layer.weights[:,0]+layer.bias[0]
    secondOutputBatch1 = x[0]@layer.weights[:,1]+layer.bias[1]
    firstOutputBatch2 = x[1]@layer.weights[:,0]+layer.bias[0]
    secondOutputBatch2 = x[1]@layer.weights[:,1]+layer.bias[1]
    debugPrint(x[0],"*",layer.weights[:,0],"+",layer.bias[0], "=", firstOutputBatch1) 
    debugPrint(x[0],"*",layer.weights[:,1],"+",layer.bias[1], "=", secondOutputBatch1)
    debugPrint(x[1],"*",layer.weights[:,0],"+",layer.bias[0], "=", firstOutputBatch2) 
    debugPrint(x[1],"*",layer.weights[:,1],"+",layer.bias[1], "=", secondOutputBatch2) 
    approx_equals(firstOutputBatch1, y[0,0])
    approx_equals(secondOutputBatch1, y[0,1])
    approx_equals(firstOutputBatch2, y[1,0])
    approx_equals(secondOutputBatch2, y[1,1])
    
testFeedforward()