In [None]:
#|default_exp tmp

In [None]:
#by @christopherthomas

In [None]:
#!git clone https://github.com/fastai/course22p2

In [None]:
#%cd course22p2

In [None]:
#!pip install -e .

# Challenge

In [None]:
import pickle,gzip,math,os,time,shutil,torch,random
import fastcore.all as fc,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from collections.abc import Mapping
from pathlib import Path
from operator import attrgetter,itemgetter
from functools import partial
from copy import copy
from contextlib import contextmanager

import torchvision.transforms.functional as TF,torch.nn.functional as F
from torch import tensor,nn,optim
from torch.utils.data import DataLoader,default_collate
from torch.nn import init
from torch.optim import lr_scheduler
from torcheval.metrics import MulticlassAccuracy
from datasets import load_dataset,load_dataset_builder

from miniai.datasets import *
from miniai.conv import *
from miniai.learner import *
from miniai.activations import *
from miniai.init import *
from miniai.sgd import *
from miniai.resnet import *

In [None]:
from fastcore.test import test_close
from torch import distributions

# Notebook-wide display settings and RNG seeding.
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

# Silence every log record at WARNING level and below for the whole session.
import logging
logging.disable(logging.WARNING)

# NOTE(review): set_seed comes from a miniai star import; presumably it reseeds torch and
# therefore supersedes the manual_seed(1) call above - confirm.
set_seed(42)

# Cap the worker count fastcore reports at 8.
if fc.defaults.cpus>8: fc.defaults.cpus=8

In [None]:
# Demo of the keep-mask used by the Dropout class below: each draw is 1 with probability 1-p.
p = 0.1
dist = distributions.binomial.Binomial(probs=1-p)
dist.sample((10,))

tensor([1., 1., 1., 1., 1., 1., 0., 1., 1., 1.])

In [None]:
class Dropout(nn.Module):
    """Dropout implemented with a Binomial(1, 1-p) keep-mask.

    In training mode every element of the input is multiplied by a 0/1 sample
    that is 1 with probability `1-p`, and the result is divided by `1-p`.
    In eval mode the input is returned unchanged.

    Args:
        p: probability of zeroing an element; must lie in [0, 1].

    Raises:
        ValueError: if `p` is outside [0, 1].
    """
    def __init__(self, p=0.1):
        super().__init__()
        # Fail at construction time instead of at the first training-mode forward pass.
        if not 0 <= p <= 1: raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p

    def forward(self, x):
        if not self.training: return x
        # p == 1 drops everything; special-cased because the 1/(1-p) rescale would divide by zero.
        if self.p == 1: return torch.zeros_like(x)
        # total_count is created on x's device so the sampled mask lives there too.
        dist = distributions.binomial.Binomial(tensor(1.0).to(x.device), probs=1-self.p)
        return x * dist.sample(x.size()) * 1/(1-self.p)

In [None]:
class TTD_CB(Callback):
    "In the `before_epoch` hook, put every `nn.Dropout`/`nn.Dropout2d` module of the model into train mode."
    def before_epoch(self, learn):
        def _enable_dropout(module):
            # Only dropout layers are touched; every other module keeps its current mode.
            if isinstance(module, (nn.Dropout,nn.Dropout2d)): module.train()
        learn.model.apply(_enable_dropout)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F  #(uncomment if needed,but you likely already have it)

#Mish - "Mish: A Self Regularized Non-Monotonic Neural Activation Function"
#https://arxiv.org/abs/1908.08681v1
#implemented for PyTorch / FastAI by lessw2020 
#github: https://github.com/lessw2020/mish

class Mish(nn.Module):
    "Mish activation: `x * tanh(softplus(x))`."
    def forward(self, x):
        # Deliberately a single expression: the original author measured roughly one second
        # per epoch saved (V100 GPU) compared with going through a temporary.
        return x * F.softplus(x).tanh()

In [None]:
from typing import Iterator
from torch.utils.data import DataLoader, WeightedRandomSampler

class TopLossesCallback(Callback):
  """Rank the samples of each training epoch by their cross-entropy loss.

  Predictions and targets of every training batch are buffered; after a
  training epoch the per-sample losses are computed and the positions of the
  samples, ordered from highest to lowest loss, are stored on
  `learn.dls.train.sampler.top_losses`. The positions index into the order in
  which the samples were seen during the epoch.
  """
  def __init__(self):
    super().__init__()
    # Per-instance buffers; class-level lists would be shared between instances.
    self.epoch_preds = []
    self.epoch_targets = []

  @torch.no_grad()
  def _calculate_top_losses(self):
    "Return the positions of the buffered samples sorted by descending loss."
    preds = torch.cat(self.epoch_preds, dim=0)
    targets = torch.cat(self.epoch_targets, dim=0)
    # `reduction='none'` replaces the deprecated `reduce=False` and keeps one loss per sample.
    losses = F.cross_entropy(preds, targets, reduction='none')
    return torch.topk(losses, preds.shape[0]).indices

  def after_batch(self, learn):
    if not learn.model.training:
      return
    # Detach so the buffered predictions do not keep each batch's autograd graph alive all epoch.
    self.epoch_preds.append(learn.preds.detach())
    self.epoch_targets.append(learn.batch[1])

  def before_epoch(self, learn):
    self.epoch_preds = []
    self.epoch_targets = []

  def after_epoch(self, learn):
    # Nothing to rank for validation epochs or when no training batch was recorded.
    if not learn.model.training or not self.epoch_preds:
      return
    learn.dls.train.sampler.top_losses = self._calculate_top_losses()

# tweaked from tommyc's version
# Before certain epoch drop a % of the training dataset with the lowest losses.
# Replace them with the same % of the training dataset with the highest losses.
# This gives the model two opportunities to train on the most challenging images.
class CustomTrainingSampler(WeightedRandomSampler):
  """Sampler that re-presents the highest-loss samples of the previous epoch.

  `self.n` maps an epoch number to the fraction of the epoch to swap. When the
  fraction is non-zero, the entries of the previous epoch's index list at the
  lowest-loss positions are overwritten with the entries at the highest-loss
  positions, and the result is shuffled. Otherwise a plain random permutation
  of `range(num_samples)` is produced.

  `top_losses` is expected to hold positions into the previous epoch's order,
  sorted from highest to lowest loss (`TopLossesCallback` writes it).
  Epochs outside the schedule, and epochs for which no ranking or no previous
  order is available, fall back to the plain permutation.
  """
  def __init__(self, *args, **kwargs):
    WeightedRandomSampler.__init__(self, *args, **kwargs)
    self.data_indexes_for_epoch = []
    self.top_losses = []
    self.epoch = -1
    # epoch -> fraction of the epoch's samples to replace
    self.n = {
        0: 0,
        1: 0.2,
        2: 0.4,
        3: 0.2,
        4: 0
    }

  def __iter__(self) -> Iterator[int]:
    self.epoch += 1
    shuffled = torch.randperm(self.num_samples, generator=self.generator)
    # Epochs beyond the schedule replace nothing instead of raising KeyError.
    n = int(self.n.get(self.epoch, 0) * self.num_samples)
    # Swapping needs both a loss ranking and the previous epoch's order.
    can_swap = len(self.top_losses) > 0 and len(self.data_indexes_for_epoch) > 0

    if n != 0 and can_swap:
      ranked = torch.as_tensor(self.top_losses).detach().cpu()
      indexes = torch.tensor(self.data_indexes_for_epoch).cpu()
      hardest = torch.flip(ranked[:n], dims=(0,))  # n highest-loss positions, reversed
      easiest = ranked[-n:]                        # n lowest-loss positions
      indexes[easiest] = indexes[hardest]
      self.data_indexes_for_epoch = indexes[shuffled].tolist()
    else:
      self.data_indexes_for_epoch = shuffled.tolist()

    yield from self.data_indexes_for_epoch


class CustomDataLoader:
    """Holder for a train/valid pair of `DataLoader`s.

    `from_dd` builds one loader per entry of a dataset dict and attaches a
    `CustomTrainingSampler` to the entry whose key is "train".
    """
    def __init__(self, *dls):
      # Only the first two loaders are kept; any extra ones are ignored.
      self.train,self.valid = dls[:2]

    @staticmethod
    def get_sampler(num_samples, mode="train"):
      "Return a uniformly weighted `CustomTrainingSampler` for mode \"train\", else `None`."
      if mode != "train":
        return None
      return CustomTrainingSampler(weights=[1]*num_samples, num_samples=num_samples)

    @classmethod
    def from_dd(cls, dd, batch_size, as_tuple=True, **kwargs):
      """Create the loaders from dataset dict `dd`; its keys are passed to `get_sampler` as `mode`.

      `as_tuple` is accepted but not used by this implementation. Extra `kwargs`
      are forwarded to every `DataLoader`.
      """
      loaders = [DataLoader(ds, batch_size, sampler=cls.get_sampler(len(ds), mode), collate_fn=collate_dict(ds), **kwargs)
                 for mode, ds in dd.items()]
      return cls(*loaders)

In [None]:
# Dataset configuration: column names, dataset name, base batch size, and the
# mean/std that `transformi` uses to normalise the images.
xl,yl = 'image','label'
name = "fashion_mnist"
bs = 256
xmean,xstd = 0.28, 0.35

@inplace
def transformi(b):
    "Replace the `xl` column of batch `b` with tensors normalised by `xmean`/`xstd`."
    normalised = []
    for img in b[xl]:
        normalised.append((TF.to_tensor(img)-xmean)/xstd)
    b[xl] = normalised

# Load the dataset dict for `name` via the `datasets` library.
dsd = load_dataset(name)

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from miniai.mps import cache_dataset_as_dict



# Attach the normalising transform, then hand the result to cache_dataset_as_dict.
# NOTE(review): presumably this materialises the transformed data in memory - confirm in miniai.mps.
tds = cache_dataset_as_dict(dsd.with_transform(transformi))


In [None]:
from miniai.utils import SpeedStatsCB

In [None]:
# Default hyper-parameters; every training cell below assigns its own `lr` and `epochs`.
set_seed(42)
lr,epochs = 6e-2,5

In [None]:
def get_model7(act=nn.ReLU, nfs=(16,32,64,128,256,512), norm=nn.BatchNorm2d):
    """Build a ResBlock classifier with 1 input channel and 10 outputs, moved to `def_device`.

    Args:
        act: activation factory passed to every `ResBlock`.
        nfs: channel widths; `nfs[0]` is the stem width and each following entry
            adds one stride-2 `ResBlock`.
        norm: normalisation layer factory passed to every `ResBlock`.

    Returns:
        An `nn.Sequential` of the stem, the stride-2 blocks, `Dropout2d(0.2)`,
        `Flatten`, a bias-free `Linear(nfs[-1], 10)` and `BatchNorm1d(10)`.
    """
    # The stem width follows nfs[0] (previously hard-coded to 16) so it always matches the first stride-2 block.
    layers = [ResBlock(1, nfs[0], ks=5, stride=1, act=act, norm=norm)]
    layers += [ResBlock(nfs[i], nfs[i+1], act=act, norm=norm, stride=2) for i in range(len(nfs)-1)]
    # NOTE(review): Linear(nfs[-1], ...) assumes the feature map is 1x1 after the stride-2 blocks - confirm for other input sizes.
    layers += [nn.Dropout2d(0.2), nn.Flatten(), nn.Linear(nfs[-1], 10, bias=False), nn.BatchNorm1d(10)]
    return nn.Sequential(*layers).to(def_device)

In [None]:
from torchvision import transforms

In [None]:
def tfm_batch(b, tfm_x=fc.noop, tfm_y = fc.noop):
    "Apply `tfm_x` to `b[0]` and `tfm_y` to `b[1]`, returning the pair as a tuple."
    new_x = tfm_x(b[0])
    new_y = tfm_y(b[1])
    return new_x,new_y

In [None]:
# Training-time augmentation: random 28x28 crop after 1 pixel of padding, plus random
# horizontal flip. `on_val=False` is passed so the callback is not applied on validation.
tfms = nn.Sequential(transforms.RandomCrop(28, padding=1),
                     transforms.RandomHorizontalFlip())
augcb = BatchTransformCB(partial(tfm_batch, tfm_x=tfms), on_val=False)

In [None]:
class CapturePreds(Callback):
    "Accumulate CPU copies of `learn.preds` and `learn.batch[1]` per batch and concatenate them after fit."
    def before_fit(self, learn):
        self.all_preds = []
        self.all_targs = []

    def after_batch(self, learn):
        batch_preds = to_cpu(learn.preds)
        batch_targs = to_cpu(learn.batch[1])
        self.all_preds.append(batch_preds)
        self.all_targs.append(batch_targs)

    def after_fit(self, learn):
        # The lists are replaced by single tensors once fitting has finished.
        joined_preds = torch.cat(self.all_preds)
        joined_targs = torch.cat(self.all_targs)
        self.all_preds,self.all_targs = joined_preds,joined_targs

In [None]:
@fc.patch
def capture_preds(self: Learner, cbs=None):
    "Run one `fit(1, train=False)` pass with a `CapturePreds` callback; return `(all_preds, all_targs)`."
    capturer = CapturePreds()
    extra_cbs = [capturer]+fc.L(cbs)
    self.fit(1, train=False, cbs=extra_cbs)
    return capturer.all_preds,capturer.all_targs

In [None]:
# tweaked from rohitgeo's version
# Callback list shared by every training cell below (each cell appends its own `xtra`).
metrics = MetricsCB(accuracy=MulticlassAccuracy())
# NOTE(review): the stats filter selects Mish modules, but no model built in this notebook
# is given Mish as its activation - confirm this is the intended filter.
astats = ActivationStats(fc.risinstance(Mish))
cbs = [SpeedStatsCB(), DeviceCB(), metrics, TopLossesCallback(), ProgressCB(plot=True), astats]

# #0.0003 from https://github.com/digantamisra98/Mish/issues/37

In [None]:
#xdls = CustomDataLoader.from_dd(tds, bs, num_workers=0)


In [None]:
# Run: get_model7 with ReLU, init_weights(leaky=0.0003), 5 epochs.
# Batch size and peak learning rate are both scaled by m (batch size bs*m, lr 1e-2*m).
act_gr = nn.ReLU
iw = partial(init_weights, leaky=0.0003)

set_seed(42)
epochs = 5
m = 8
lr = 1e-2 *m
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)
# OneCycleLR is stepped per batch, so it needs the total number of training batches.
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
model = get_model7(act_gr, norm=nn.BatchNorm2d).apply(iw)
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: get_model7 with GeneralRelu(leak=0.1, sub=0.4), init_weights(leaky=0.1), 1 epoch.
act_gr = partial(GeneralRelu, leak=0.1, sub=0.4)
iw = partial(init_weights, leaky=0.1)
#iw = partial(init_weights, leaky=0.0003)

set_seed(42)
epochs = 1
m = 8
lr = 1e-2 *m
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)
# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
model = get_model7(act_gr, norm=nn.BatchNorm2d).apply(iw)
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: get_model7 with nn.Hardswish, init_weights(leaky=0.1), 1 epoch.
act_gr = nn.Hardswish
iw = partial(init_weights, leaky=0.1)
#iw = partial(init_weights, leaky=0.0003)

set_seed(42)
epochs = 1
m = 8
lr = 1e-2 *m
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)
# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
model = get_model7(act_gr, norm=nn.BatchNorm2d).apply(iw)
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: same Hardswish configuration as the previous cell, but trained for 5 epochs.
act_gr = nn.Hardswish
iw = partial(init_weights, leaky=0.1)
#iw = partial(init_weights, leaky=0.0003)

set_seed(42)
epochs = 5
m = 8
lr = 1e-2 *m
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)
# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
model = get_model7(act_gr, norm=nn.BatchNorm2d).apply(iw)
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
from miniai.init_lsuv import LSUVInit

# Run: get_model7 with nn.Hardswish, init_weights(leaky=0.0), plus the LSUVInit callback, 5 epochs.
act_gr = nn.Hardswish
iw = partial(init_weights, leaky=0.000)
#iw = partial(init_weights, leaky=0.0003)

set_seed(42)
epochs = 5
m = 8
lr = 1e-2 *m
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)
# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
# NOTE(review): LSUVInit is defined in miniai.init_lsuv; the meaning of skip_last=1 is not visible here - confirm.
xtra += [LSUVInit(skip_last=1)]
model = get_model7(act_gr, norm=nn.BatchNorm2d).apply(iw)
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
LSUVInit??

In [None]:
# Run: get_model7 with GeneralRelu(leak=0.1, sub=0.4), init_weights(leaky=0.1), 5 epochs.
act_gr = partial(GeneralRelu, leak=0.1, sub=0.4)
iw = partial(init_weights, leaky=0.1)
#iw = partial(init_weights, leaky=0.0003)

set_seed(42)
epochs = 5
m = 8
lr = 1e-2 *m
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)
# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
model = get_model7(act_gr, norm=nn.BatchNorm2d).apply(iw)
# The partial below binds no arguments, so it is equivalent to passing optim.AdamW directly.
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, 
                     opt_func=partial(optim.AdamW,  ))
learn.fit(epochs)

In [None]:
import timm

# Run: timm resnet18d (1 input channel, 10 classes) with timm's own initialisation, 5 epochs.
# `dls` is not rebuilt here: the cell relies on the loaders left by the previous training cell.
set_seed(42)
epochs = 5
lr = 1e-2
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
model = timm.create_model('resnet18d', in_chans=1, num_classes=10)
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: timm resnet18d re-initialised with init_weights(leaky=0.0), 5 epochs.
# `dls` is reused from an earlier cell.
iw = partial(init_weights, leaky=0.0)

set_seed(42)
epochs = 5
lr = 1e-2
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
model = timm.create_model('resnet18d', in_chans=1, num_classes=10).apply(iw)
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: timm resnet18d with GeneralRelu as `act_layer`, init_weights(leaky=0.1), 5 epochs.
# act_gr accepts and ignores `inplace`.
# NOTE(review): presumably timm passes inplace=... when building activations - confirm.
def act_gr(inplace=None): return GeneralRelu(leak=0.1, sub=0.4)
set_seed(42)
iw = partial(init_weights, leaky=0.1)

model = timm.create_model('resnet18d', in_chans=1, num_classes=10, act_layer=act_gr).apply(iw)

epochs = 5
lr = 1e-2
# `dls` is reused from an earlier cell.
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: same resnet18d + GeneralRelu setup, with fresh loaders of batch size bs*m and lr = 1e-2*m/4.
def act_gr(inplace=None): return GeneralRelu(leak=0.1, sub=0.4)
set_seed(42)
iw = partial(init_weights, leaky=0.1)

model = timm.create_model('resnet18d', in_chans=1, num_classes=10, act_layer=act_gr).apply(iw)

epochs = 5
m = 8
lr = 1e-2 * m / 4
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)

# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: timm resnet34d with GeneralRelu, init_weights(leaky=0.1), batch size bs*m, lr = 1e-2*m/4.
def act_gr(inplace=None): return GeneralRelu(leak=0.1, sub=0.4)
set_seed(42)
iw = partial(init_weights, leaky=0.1)

model = timm.create_model('resnet34d', in_chans=1, num_classes=10, act_layer=act_gr).apply(iw)

epochs = 5
m = 8
lr = 1e-2 * m / 4
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)

# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: timm resnet50d with GeneralRelu, init_weights(leaky=0.1), batch size bs*m, lr = 1e-2*m/8.
def act_gr(inplace=None): return GeneralRelu(leak=0.1, sub=0.4)
set_seed(42)
iw = partial(init_weights, leaky=0.1)

model = timm.create_model('resnet50d', in_chans=1, num_classes=10, act_layer=act_gr).apply(iw)

epochs = 5
m = 8
lr = 1e-2 * m / 8
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)

# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# efficient net

In [None]:
timm.list_models('effici*')

In [None]:

# Run: timm efficientnet_lite0 with its default activation and initialisation
# (the custom activation / init lines are commented out), batch size bs*m, lr = 1e-2*m/2.
# def act_gr(inplace=None): return GeneralRelu(leak=0.1, sub=0.4)
set_seed(42)
# iw = partial(init_weights, leaky=0.1)

model = timm.create_model('efficientnet_lite0', in_chans=1, num_classes=10, )#.apply(init_weights)

epochs = 5
m = 8
lr = 1e-2 * m /2
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)

# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:

# Run: timm efficientnet_lite0 with GeneralRelu as `act_layer`, init_weights(leaky=0.1),
# batch size bs*m, lr = 1e-2*m/4.
def act_gr(inplace=None): return GeneralRelu(leak=0.1, sub=0.4)
set_seed(42)
iw = partial(init_weights, leaky=0.1)

model = timm.create_model('efficientnet_lite0', in_chans=1, num_classes=10, act_layer=act_gr 
                         ).apply(iw)

epochs = 5
m = 8
lr = 1e-2 * m /4
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)

# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
model

# convnext_base

In [None]:
timm.list_models('convnext*')

In [None]:
# Run: timm convnext_pico (output_stride=16) with the LSUVInit callback, batch size bs*m, lr = 1e-2*m/8.
set_seed(42)
model = timm.create_model('convnext_pico', in_chans=1, num_classes=10, output_stride=16)

epochs = 5
m = 8
lr = 1e-2 * m / 8
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)
# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
# NOTE(review): LSUVInit comes from miniai.init_lsuv; skip_last semantics are not visible here - confirm.
xtra += [LSUVInit(skip_last=1)]
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: timm convnext_pico re-initialised with `iw` instead of LSUV.
# NOTE(review): `iw` is not assigned in this cell; it is whatever an earlier cell left behind
# (in notebook order, init_weights with leaky=0.1 from the EfficientNet cell) - confirm that is intended.
set_seed(42)
model = timm.create_model('convnext_pico', in_chans=1, num_classes=10, output_stride=16).apply(iw)

epochs = 5
m = 8
lr = 1e-2 * m / 8
dls = DataLoaders.from_dd(tds, bs*m, num_workers=0)
# total_steps for the per-batch one-cycle schedule
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
#xtra += [LSUVInit(skip_last=1)]
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Run: timm convnext_nano (output_stride=16) with default initialisation, lr = 1e-2, 5 epochs.
# `dls` is reused from the previous cell rather than rebuilt.
set_seed(42)
model = timm.create_model('convnext_nano', in_chans=1, num_classes=10, output_stride=16)

epochs = 5
lr = 1e-2
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Show the lines of the model's repr that mention a stride other than (1, 1).
[i for i in str(model).split('\n') if 'stride' in i and not 'stride=(1, 1)' in i] 

In [None]:
# Run: timm convnext_base (output_stride=16) with default initialisation, lr = 1e-2, 5 epochs.
# `dls` is reused from an earlier cell.
set_seed(42)
model = timm.create_model('convnext_base', in_chans=1, num_classes=10, output_stride=16)

epochs = 5
lr = 1e-2
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Levit

# Levit

In [None]:
timm.list_models('levit*')

In [None]:
# Run: timm levit_128 with default activation and initialisation, lr = 1e-2, 5 epochs.
# `act_gr` and `iw` are defined here but not used by this cell: the model is created
# without `act_layer` and without `.apply(iw)`.
def act_gr(inplace=None): return GeneralRelu(leak=0.1, sub=0.4)
set_seed(42)
iw = partial(init_weights, leaky=0.1)

model = timm.create_model('levit_128', in_chans=1, num_classes=10)

epochs = 5
lr = 1e-2
# `dls` is reused from an earlier cell.
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
model.__init__??

In [None]:
model

In [None]:
# Run: attempt to train a model created under the bare name 'levit'.
# `act_gr` and `iw` are defined here but not used by this cell.
def act_gr(inplace=None): return GeneralRelu(leak=0.1, sub=0.4)
set_seed(42)
iw = partial(init_weights, leaky=0.1)

# NOTE(review): the earlier cell used 'levit_128'; bare 'levit' may not be a registered
# timm model name - check against timm.list_models('levit*').
model = timm.create_model('levit', in_chans=1, num_classes=10)

epochs = 5
lr = 1e-2
# `dls` is reused from an earlier cell.
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]

learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
model.__init__??

In [None]:
# Appends LSUVInit to the current `xtra` list. The next training cell reassigns `xtra`,
# so this addition does not carry over to it.
xtra += [LSUVInit(skip_last=1)]

In [None]:
# Run: timm resnet18d with init_weights(leaky=0.0), 5 epochs. The resulting `learn` is the
# learner used by the test-time-augmentation cells that follow.
# `dls` is reused from an earlier cell.
iw = partial(init_weights, leaky=0.0)

set_seed(42)
epochs = 5
lr = 1e-2
tmax = epochs * len(dls.train)
sched = partial(lr_scheduler.OneCycleLR, max_lr=lr, total_steps=tmax)
xtra = [BatchSchedCB(sched), augcb]
model = timm.create_model('resnet18d', in_chans=1, num_classes=10).apply(iw)
learn = TrainLearner(model, dls, F.cross_entropy, lr=lr, cbs=cbs+xtra, opt_func=optim.AdamW)
learn.fit(epochs)

In [None]:
# Predictions and targets from one non-training pass without extra callbacks.
ap1, at = learn.capture_preds()

In [None]:
# Second pass with every input horizontally flipped; `on_val=True` is passed so the
# transform is applied during this non-training pass as well.
ttacb = BatchTransformCB(partial(tfm_batch, tfm_x=TF.hflip), on_val=True)
ap2, at = learn.capture_preds(cbs=[ttacb])

In [None]:
# Average the two sets of model outputs, take the arg-max class, and report the fraction
# matching the targets, rounded to 3 decimals.
ap = torch.stack([ap1,ap2]).mean(0).argmax(1)
round((ap==at).float().mean().item(), 3)