In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
np.__version__

'1.19.2'

In [3]:
#export
from exp.nb_06 import *

# ConvNet

In [4]:
x_train, y_train, x_valid, y_valid = get_data()

In [5]:
x_train.mean(), x_train.std()

(tensor(-9.9649e-10), tensor(1.0000))

In [6]:
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)

In [7]:
nh, bs = 50, 512
c = y_train.max().item() + 1
loss_func = F.cross_entropy

In [8]:
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)

In [9]:
mnist_view = view_tfm(1,28,28)
cbfs = [Recorder,
        CudaCallback,
       partial(AvgStatsCallback, accuracy),
       partial(BatchTransformXCallback, mnist_view)]

In [10]:
nfs = [8, 16, 32, 64, 64]

In [11]:
learn, run = get_learn_run(nfs, data, 0.5, conv_layer, cbs=cbfs)

In [12]:
%time run.fit(2, learn)

train: [0.7427634765625, tensor(0.7600, device='cuda:0')]
valid: [0.2911787109375, tensor(0.9130, device='cuda:0')]
train: [0.14137265625, tensor(0.9566, device='cuda:0')]
valid: [0.108356396484375, tensor(0.9647, device='cuda:0')]
CPU times: user 2.61 s, sys: 471 ms, total: 3.08 s
Wall time: 2.75 s


In [13]:
class BatchNorm(nn.Module):
    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        self.mom, self.eps = mom, eps
        self.mults = nn.Parameter(torch.ones(nf, 1, 1))
        self.adds = nn.Parameter(torch.zeros(nf, 1, 1))
        self.register_buffer('means', torch.zeros(1,nf,1,1))
        self.register_buffer('vars', torch.ones(1,nf,1,1))
        
    def update_stats(self, x):
        m = x.mean((0,2,3), keepdim=True)
        v = x.var((0,2,3), keepdim=True)
        self.means.lerp_(m, self.mom)
        self.vars.lerp_(v, self.mom)
        return m, v
    
    def forward(self, x):
        if self.training:
            with torch.no_grad(): m,v = self.update_stats(x)
        else: m, v = self.means, self.vars
        x = (x - m) / (v + self.eps).sqrt()
        return x * self.mults + self.adds

In [14]:
def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, stride, padding=ks//2, bias=not bn),
             GeneralRelu(**kwargs)]
    if bn: layers.append(BatchNorm(nf))
    return nn.Sequential(*layers)

In [15]:
def get_learn_run(nfs, data, lr, layer, cbs=None, opt_func=None, uniform=False, **kwargs):
    model = get_cnn_model(data, nfs, layer)
    init_cnn(model, uniform=uniform)
    return get_runner(model, data, lr, cbs, opt_func)

In [16]:
def init_cnn(m, uniform=False):
    f = init.kaiming_uniform_ if uniform else init.kaiming_normal_
    init_cnn_(m, f)

In [17]:
def init_cnn_(m, f):
    if isinstance(m, nn.Conv2d):
        f(m.weight, a=0.1)
        if getattr(m, 'bias', None) is not None: m.bias.data.zero_()
    for l in m.children(): init_cnn_(l, f)

In [18]:
learn, run = get_learn_run(nfs, data, 0.9, conv_layer, cbfs)

In [19]:
learn.model

Sequential(
  (0): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2), bias=False)
    (1): GeneralRelu()
    (2): BatchNorm()
  )
  (1): Sequential(
    (0): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): GeneralRelu()
    (2): BatchNorm()
  )
  (2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): GeneralRelu()
    (2): BatchNorm()
  )
  (3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): GeneralRelu()
    (2): BatchNorm()
  )
  (4): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): GeneralRelu()
    (2): BatchNorm()
  )
  (5): AdaptiveAvgPool2d(output_size=1)
  (6): Lambda()
  (7): Linear(in_features=64, out_features=10, bias=True)
)

In [None]:
Hooks??

In [None]:
get_learn_run??

In [None]:
with Hooks(learn.model, append_stats) as hooks:
    run.fit(1, learn)
    
    fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10,4))
    for h in hooks[:-1]:
        ms, ss = h.stats
        ax0.plot(ms[:10])
        ax1.plot(ss[:10])
        h.remove()
    plt.legend(range(6));
    
    fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10,4))
    for h in hooks[:-1]:
        ms, ss, = h.stats
        ax0.plot(ms)
        ax1.plot(ss)
    plt.legend(range(6))

In [None]:
learn, run = get_learn_run(nfs, data, 1.0, conv_layer, cbfs)

In [24]:
%time run.fit(3, learn)

train: [0.24103836263020834, tensor(0.9240, device='cuda:0')]
valid: [0.126695947265625, tensor(0.9594, device='cuda:0')]
train: [0.0818512939453125, tensor(0.9747, device='cuda:0')]
valid: [0.5217087890625, tensor(0.8484, device='cuda:0')]
train: [0.062210355631510415, tensor(0.9809, device='cuda:0')]
valid: [0.14971971435546874, tensor(0.9535, device='cuda:0')]
CPU times: user 3.92 s, sys: 627 ms, total: 4.55 s
Wall time: 3.95 s


In [25]:
def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, stride, padding=ks//2, bias=not bn),
             GeneralRelu(**kwargs)]
    if bn: layers.append(nn.BatchNorm2d(nf))
    return nn.Sequential(*layers)

In [26]:
learn, run = get_learn_run(nfs, data, 1., conv_layer, cbfs)

In [27]:
%time run.fit(3, learn)

train: [0.20321240234375, tensor(0.9373, device='cuda:0')]
valid: [0.239667431640625, tensor(0.9241, device='cuda:0')]
train: [0.06327790934244791, tensor(0.9797, device='cuda:0')]
valid: [0.07068351440429688, tensor(0.9785, device='cuda:0')]
train: [0.04151160481770833, tensor(0.9872, device='cuda:0')]
valid: [0.07696593627929688, tensor(0.9757, device='cuda:0')]
CPU times: user 4.93 s, sys: 532 ms, total: 5.47 s
Wall time: 4.86 s


In [28]:
sched = combine_scheds([0.3, 0.7], [sched_lin(0.6, 2.), sched_lin(2., 0.1)]) 

In [29]:
learn,run = get_learn_run(nfs, data, 0.9, conv_layer, cbs=cbfs
                          +[partial(ParamScheduler,'lr', sched)])

In [30]:
run.fit(8, learn)

train: [0.21216072591145832, tensor(0.9360, device='cuda:0')]
valid: [0.2795697265625, tensor(0.9163, device='cuda:0')]
train: [0.0718368408203125, tensor(0.9775, device='cuda:0')]
valid: [0.07884379272460937, tensor(0.9727, device='cuda:0')]
train: [0.04805149332682292, tensor(0.9850, device='cuda:0')]
valid: [0.190367919921875, tensor(0.9418, device='cuda:0')]
train: [0.031479264322916665, tensor(0.9901, device='cuda:0')]
valid: [0.04675704650878906, tensor(0.9856, device='cuda:0')]
train: [0.018110154215494792, tensor(0.9947, device='cuda:0')]
valid: [0.03555221252441406, tensor(0.9887, device='cuda:0')]
train: [0.010334288533528645, tensor(0.9973, device='cuda:0')]
valid: [0.03924672241210937, tensor(0.9863, device='cuda:0')]
train: [0.0063715387980143225, tensor(0.9987, device='cuda:0')]
valid: [0.03159510498046875, tensor(0.9898, device='cuda:0')]
train: [0.004243774668375651, tensor(0.9995, device='cuda:0')]
valid: [0.031404281616210936, tensor(0.9897, device='cuda:0')]


# More norms

## Layer norms

In [31]:
class LayerNorm(nn.Module):
    __constants__ = ['eps']
    def __init__(self, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.mult = nn.Parameter(tensor(1.))
        self.add = nn.Parameter(tensor(0.))
        
    def forward(self, x):
        m = x.mean((1,2,3), keepdim=True)
        v = x.var((1,2,3), keepdim=True)
        x = (x - m) / (v + self.eps).sqrt()
        return x * self.mult + self.add

In [32]:
def conv_ln(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, stride, padding=ks//2, bias=True),
             GeneralRelu(**kwargs)]
    if bn: layers.append(LayerNorm())
    return nn.Sequential(*layers)

In [33]:
learn, run = get_learn_run(nfs, data, 0.8, conv_ln, cbfs)

In [34]:
%time run.fit(3, learn)

train: [nan, tensor(0.1251, device='cuda:0')]
valid: [nan, tensor(0.0980, device='cuda:0')]
train: [nan, tensor(0.0987, device='cuda:0')]
valid: [nan, tensor(0.0980, device='cuda:0')]
train: [nan, tensor(0.0987, device='cuda:0')]
valid: [nan, tensor(0.0980, device='cuda:0')]
CPU times: user 4.93 s, sys: 630 ms, total: 5.56 s
Wall time: 4.96 s


# Instance Norm

In [35]:
class InstanceNorm(nn.Module):
    __constants__ = ['eps']
    def __init__(self, nf, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.mult = nn.Parameter(torch.ones(nf,1,1))
        self.adds = nn.Parameter(torch.zeros(nf,1,1))
        
    def forward(self, x):
        m = x.mean((2,3), keepdim=True)
        v = x.var((2,3), keepdim=True)
        x = (x - m) / (v + self.eps).sqrt()
        return x * self.mult + self.adds

In [36]:
def conv_in(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, stride, padding=ks//2, bias=True),
             GeneralRelu(**kwargs)]
    if bn: layers.append(InstanceNorm(nf))
    return nn.Sequential(*layers)

In [37]:
learn, run = get_learn_run(nfs, data, 0.1, conv_in, cbfs)

In [38]:
run.fit(3, learn)

train: [nan, tensor(0.0987, device='cuda:0')]
valid: [nan, tensor(0.0980, device='cuda:0')]
train: [nan, tensor(0.0987, device='cuda:0')]
valid: [nan, tensor(0.0980, device='cuda:0')]
train: [nan, tensor(0.0987, device='cuda:0')]
valid: [nan, tensor(0.0980, device='cuda:0')]


# Fix small batch sizes

In [39]:
data = DataBunch(*get_dls(train_ds, valid_ds, 2), c)

In [40]:
def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(nn.BatchNorm2d(nf, eps=1e-5, momentum=0.1))
    return nn.Sequential(*layers)

In [41]:
learn,run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs)

In [42]:
%time run.fit(1, learn)

train: [2.3486611979166665, tensor(0.1752, device='cuda:0')]
valid: [13775449.2928, tensor(0.1738, device='cuda:0')]
CPU times: user 1min 36s, sys: 9.43 s, total: 1min 45s
Wall time: 1min 44s


In [43]:
class RunningBatchNorm(nn.Module):
    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        self.mom, self.eps = mom, eps
        self.mults = nn.Parameter(torch.ones(nf,1,1))
        self.adds = nn.Parameter(torch.zeros(nf,1,1))
        self.register_buffer('sums', torch.zeros(1,nf,1,1))
        self.register_buffer('sqrs', torch.zeros(1,nf,1,1))
        self.register_buffer('batch', tensor(0.))
        self.register_buffer('count', tensor(0.))
        self.register_buffer('step', tensor(0.))
        self.register_buffer('dbias', tensor(0.))
     
    def update_stats(self, x):
        bs, nc, *_ = x.shape
        self.sums.detach_()
        self.sqrs.detach_()
        dims = (0,2,3)
        s = x.sum(dims, keepdim=True)
        ss = (x*x).sum(dims, keepdim=True)
        
        c = self.count.new_tensor(x.numel() / nc)
        mom1 = 1 - (1 - self.mom) / math.sqrt(bs-1)
        self.mom1 = self.dbias.new_tensor(mom1)
        self.sums.lerp_(s, self.mom1)
        self.sqrs.lerp_(ss, self.mom1)
        self.count.lerp_(c, self.mom1)
        self.dbias = self.dbias * (1 - self.mom1) + self.mom1
        self.batch += bs
        self.step += 1
    
    def forward(self, x):
        if self.training: self.update_stats(x)
        sums = self.sums
        sqrs = self.sqrs
        c = self.count
        if self.step < 100:
            sums = sums / self.dbias
            sqrs = sqrs / self.dbias
            c = c / self.dbias
        means = sums / c
        vars = (sqrs/c).sub_(means*means)
        
        if bool(self.batch < 20): vars.clamp_min_(0.01)
        x = (x - means).div_((vars.add_(self.eps)).sqrt())
        return x.mul_(self.mults).add_(self.adds)

In [44]:
def conv_rbn(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(RunningBatchNorm(nf))
    return nn.Sequential(*layers)

In [45]:
learn,run = get_learn_run(nfs, data, 0.4, conv_rbn, cbs=cbfs)

In [46]:
# %time run.fit(1, learn)

In [47]:
data = DataBunch(*get_dls(train_ds, valid_ds, 32), c)

In [48]:
learn,run = get_learn_run(nfs, data, 0.9, conv_rbn, cbs=cbfs
                          +[partial(ParamScheduler,'lr', sched_lin(1., 0.2))])

In [49]:
%time run.fit(1, learn)

train: [0.14209744466145832, tensor(0.9557, device='cuda:0')]
valid: [0.050409323120117186, tensor(0.9843, device='cuda:0')]
CPU times: user 17.4 s, sys: 3.61 s, total: 21 s
Wall time: 20.7 s


In [50]:
class RunningBatchNorm(nn.Module):
    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        self.mom, self.eps = mom, eps
        self.mults = nn.Parameter(torch.ones(nf,1,1))
        self.adds = nn.Parameter(torch.zeros(nf,1,1))
        self.register_buffer('sums', torch.zeros(1,nf,1,1))
        self.register_buffer('sqrs', torch.zeros(1,nf,1,1))
        self.register_buffer('count', tensor(0.))
#         self.register_buffer('step', tensor(0.))
#         self.register_buffer('dbias', tensor(0.))
#         self.register_buffer('batch', tensor(0.))
        self.batch = 0
        self.register_buffer('factor', tensor(0.))
        self.register_buffer('offset', tensor(0.))
     
    def update_stats(self, x):
        bs, nc, *_ = x.shape
        self.sums.detach_()
        self.sqrs.detach_()
        dims = (0,2,3)
        s = x.sum(dims, keepdim=True)
        ss = (x*x).sum(dims, keepdim=True)
#         c = self.count.new_tensor(x.numel() / nc)
        c = s.new_tensor(x.numel() / nc)
    
#         mom1 = 1 - (1 - self.mom) / math.sqrt(bs-1)
        mom1 = s.new_tensor(1 - (1-self.mom) / math.sqrt(bs-1))
#         self.mom1 = self.dbias.new_tensor(mom1)
        self.sums.lerp_(s, mom1)
        self.sqrs.lerp_(ss, mom1)
        self.count.lerp_(c, mom1)
#         self.dbias = self.dbias * (1 - self.mom1) + self.mom1
        self.batch += bs
#         self.step += 1
        means = self.sums / self.count
        varns = (self.sqrs / self.count).sub_(means*means)
        if bool(self.batch < 20): varns.clamp_min_(0.01) 
        self.factor = self.mults / (varns + self.eps).sqrt()
        self.offset = self.adds - means * self.factor
    
    def forward(self, x):
        if self.training: self.update_stats(x)
        return x * self.factor + self.offset

In [51]:
learn,run = get_learn_run(nfs, data, 0.9, conv_rbn, cbs=cbfs
                          +[partial(ParamScheduler,'lr', sched_lin(1., 0.2))])

In [52]:
%time run.fit(1, learn)

train: [0.136450732421875, tensor(0.9576, device='cuda:0')]
valid: [0.04484238891601563, tensor(0.9854, device='cuda:0')]
CPU times: user 14.2 s, sys: 1.8 s, total: 16 s
Wall time: 15.8 s
