In [1]:
%config Completer.use_jedi = False

In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
#export
from exp.nb_10b import *

In [4]:
torch.cuda.is_available()

True

In [5]:
#export
import apex.fp16_utils as fp16

# Converting the model to FP16

In [6]:
# Batchnorm layer classes; `bn_to_float` casts any module of these types back to FP32.
bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)

In [7]:
def bn_to_float(model):
    """Cast every batchnorm layer inside `model` (and `model` itself, if it is one) to FP32.

    Returns `model` so the call can be chained.
    """
    # `modules()` yields `model` first and then all of its descendants,
    # which is the same set of layers a manual recursion over `children()` visits.
    for layer in model.modules():
        if isinstance(layer, bn_types):
            layer.float()
    return model

In [8]:
def model_to_half(model):
    """Convert `model` to FP16, then put its batchnorm layers back in FP32 via `bn_to_float`."""
    return bn_to_float(model.half())

In [9]:
# Toy Linear -> BatchNorm -> Linear stack on the GPU, converted with our own `model_to_half`.
# It is only used to inspect parameter dtypes (see `check_weights`).
model = nn.Sequential(nn.Linear(10,30), nn.BatchNorm2d(30), nn.Linear(30,2)).cuda()
model = model_to_half(model)

In [10]:
def check_weights(model):
    """Assert the dtype layout of a three-layer model: FP16, FP32, FP16.

    Both `weight` and `bias` of `model[0]`, `model[1]` and `model[2]` are checked;
    an `AssertionError` is raised on the first mismatch.
    """
    expected_dtypes = (torch.float16, torch.float32, torch.float16)
    for idx, dtype in enumerate(expected_dtypes):
        layer = model[idx]
        assert layer.weight.dtype == dtype
        assert layer.bias.dtype == dtype

In [11]:
check_weights(model)

In [12]:
# Same toy model, this time converted with apex's `fp16.convert_network`;
# it has to pass the same dtype check as our own helper.
model = nn.Sequential(nn.Linear(10,30), nn.BatchNorm2d(30), nn.Linear(30,2)).cuda()
model = fp16.convert_network(model, torch.float16)
check_weights(model)

# Creating the master copy of parameters

In [13]:
def same_lists(ps1, ps2):
    """Assert that two parameter lists match element by element.

    The lists must have the same length, and each pair must agree on
    `requires_grad` and hold close values once both are cast to FP32.
    """
    assert len(ps1) == len(ps2)
    for left, right in zip(ps1, ps2):
        assert left.requires_grad == right.requires_grad
        # Compare in FP32 so that an FP16 tensor can be checked against its FP32 copy.
        left32, right32 = left.data.float(), right.data.float()
        assert torch.allclose(left32, right32)

In [14]:
def check_grads(m1, m2):
    """Assert that paired parameters from `m1` and `m2` carry matching gradients.

    For each pair, either both gradients are `None` or both are set and close
    in value. Gradients are cast to FP32 before the comparison (as `same_lists`
    does for the values) so an FP16 gradient can be checked against its FP32
    counterpart instead of tripping a dtype mismatch in `torch.allclose`.

    Raises `AssertionError` on the first pair that does not match.
    """
    for p1, p2 in zip(m1, m2):
        if p1.grad is None:
            assert p2.grad is None
        else:
            # Fail with a clean assertion rather than an AttributeError on `None.data`.
            assert p2.grad is not None
            assert torch.allclose(p1.grad.data.float(), p2.grad.data.float())

In [15]:
from torch._utils import _unflatten_dense_tensors

In [16]:
#export
from torch.nn.utils import parameters_to_vector

In [17]:
#export
def get_master(opt, flat_master=False):
    """Build FP32 master copies of the trainable parameters held by `opt`.

    `opt.param_groups` is read as an iterable of parameter groups, each an
    iterable of parameters; parameters with `requires_grad == False` are dropped.

    With `flat_master=True` every group is flattened into a single FP32
    `Parameter` whose `.grad` is pre-allocated (zero-filled), so each master
    group is a one-element list. Otherwise each master group holds one
    detached FP32 leaf copy per model parameter, with `requires_grad=True`.

    Returns `(model_pgs, master_pgs)`: the filtered model parameter groups and
    the matching master groups.
    """
    model_pgs = [[param for param in pg if param.requires_grad] for pg in opt.param_groups]
    if flat_master:
        master_pgs = []
        for pg in model_pgs:
            mp = parameters_to_vector([param.data.float() for param in pg])
            mp = torch.nn.Parameter(mp, requires_grad=True)
            # A fresh Parameter never has a grad. Allocate it up front with zeros
            # rather than uninitialised memory (legacy `Tensor.new`), so the buffer
            # never exposes garbage values before gradients are copied into it.
            mp.grad = torch.zeros_like(mp.data)
            master_pgs.append([mp])
    else:
        master_pgs = [[param.clone().float().detach().requires_grad_(True) for param in pg]
                      for pg in model_pgs]
    return model_pgs, master_pgs

In [18]:
#export
def to_master_grads(model_pgs, master_pgs, flat_master:bool=False)->None:
    """Call apex's `fp16.model_grads_to_master_grads` on each (model group, master group) pair.

    `flat_master` is forwarded unchanged to every call.
    """
    for model_group, master_group in zip(model_pgs, master_pgs):
        fp16.model_grads_to_master_grads(model_group, master_group, flat_master)

In [19]:
#export
def to_model_params(model_pgs, master_pgs, flat_master:bool=False)->None:
    """Call apex's `fp16.master_params_to_model_params` on each (model group, master group) pair.

    `flat_master` is forwarded unchanged to every call.
    """
    for model_group, master_group in zip(model_pgs, master_pgs):
        fp16.master_params_to_model_params(model_group, master_group, flat_master)

# The main Callback

In [20]:
class MixedPrecision(Callback):
    """Mixed-precision training callback with a fixed loss scale.

    The model is converted with apex's `fp16.convert_network`, the optimizer is
    pointed at FP32 master copies of the parameters, and the loss is multiplied
    by `loss_scale` before the backward pass.
    """
    _order = 99

    def __init__(self, loss_scale=512, flat_master=False):
        assert torch.backends.cudnn.enabled, 'Mixed precision training requires cudnn.'
        self.loss_scale = loss_scale
        self.flat_master = flat_master

    def begin_fit(self):
        # Convert the network, then let the optimizer work on the FP32 master copies.
        self.run.model = fp16.convert_network(self.model, torch.float16)
        model_groups, master_groups = get_master(self.opt, self.flat_master)
        self.model_pgs, self.master_pgs = model_groups, master_groups
        self.run.opt.param_groups = self.master_pgs

    def after_fit(self):
        # Put the model back in full precision once training is over.
        self.model.float()

    def begin_batch(self):
        # Feed the network half-precision inputs.
        self.run.xb = self.run.xb.half()

    def after_pred(self):
        # Cast the predictions to FP32 before the loss is computed from them.
        self.run.pred = self.run.pred.float()

    def after_loss(self):
        self.run.loss *= self.loss_scale

    def after_backward(self):
        # Move the (scaled) gradients onto the master parameters, then undo the scaling.
        to_master_grads(self.model_pgs, self.master_pgs, self.flat_master)
        for group in self.master_pgs:
            for master in group:
                if master.grad is None:
                    continue
                master.grad.div_(self.loss_scale)

    def after_step(self):
        # Clear the model gradients and push the updated master weights back to the model.
        self.model.zero_grad()
        to_model_params(self.model_pgs, self.master_pgs, self.flat_master)

In [21]:
path = untar_data(URLs.IMAGENETTE_160)

In [22]:
# Item transforms handed to `ImageList.from_files`.
tfms = [make_rgb, ResizeFixed(128), to_byte_tensor, to_float_tensor]
bs = 64  # first argument of `to_databunch` below

# Build the data bunch from the files under `path`: split with `grandparent_splitter`
# (valid_name='val'), label with `parent_labeler`, process targets with `CategoryProcessor`.
il = ImageList.from_files(path, tfms=tfms)
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='val'))
ll = label_by_func(sd, parent_labeler, proc_y=CategoryProcessor())
data = ll.to_databunch(bs, c_in=3, c_out=10, num_workers=4)

In [23]:
nfs = [32,64,128,256,512]

In [24]:
def get_learner(nfs, data, lr, layer, loss_func=F.cross_entropy,
                cb_funcs=None, opt_func=adam_opt(), **kwargs):
    """Create a CNN with `get_cnn_model`, initialise it with `init_cnn` and wrap it in a `Learner`.

    Extra keyword arguments are forwarded to `get_cnn_model`. The default
    `opt_func` is the result of a single `adam_opt()` call made when this
    function is defined.
    """
    cnn = get_cnn_model(data, nfs, layer, **kwargs)
    init_cnn(cnn)
    learner = Learner(cnn, data, loss_func, lr=lr, cb_funcs=cb_funcs, opt_func=opt_func)
    return learner

In [25]:
# Callbacks for the baseline run: no MixedPrecision in the list.
cbfs = [partial(AvgStatsCallback,accuracy),
        ProgressCallback,
        CudaCallback,
        partial(BatchTransformXCallback, norm_imagenette)]

In [26]:
learn = get_learner(nfs, data, 1e-2, conv_layer, cb_funcs=cbfs)

In [27]:
learn.fit(1)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,2.032667,0.353575,2.310427,0.287643,00:12




In [28]:
# Callbacks for the mixed-precision run: MixedPrecision is added with its default arguments.
cbfs = [partial(AvgStatsCallback,accuracy),
        CudaCallback,
        ProgressCallback,
        partial(BatchTransformXCallback, norm_imagenette),
        MixedPrecision]

In [29]:
learn = get_learner(nfs, data, 1e-2, conv_layer, cb_funcs=cbfs)

In [30]:
learn.fit(1)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,1.986136,0.347978,6.497857,0.20535,00:10




In [31]:
test_eq(next(learn.model.parameters()).type(), 'torch.cuda.FloatTensor')

# Dynamic loss scaling

In [32]:
#export
def test_overflow(x):
    s = float(x.float().sum())
    return (s == float('inf') or s == float('-inf') or s != s)

In [33]:
x = torch.randn(512,1024).cuda()

In [34]:
test_overflow(x)

False

In [35]:
x[123,145] = float('inf')
test_overflow(x)

True

In [36]:
%timeit test_overflow(x)

233 µs ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [37]:
%timeit torch.isnan(x).any().item()

227 µs ± 41.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [38]:
#export
def grad_overflow(param_groups):
    """Return True as soon as one parameter's gradient sums (in FP32) to +inf, -inf or NaN.

    `param_groups` is an iterable of groups of parameters; parameters whose
    `.grad` is `None` are skipped. Returns False when no such gradient is found.
    """
    inf = float('inf')
    for group in param_groups:
        for p in group:
            if p.grad is None:
                continue
            total = float(p.grad.data.float().sum())
            # `total != total` is only true for NaN.
            if total != total or abs(total) == inf:
                return True
    return False

In [39]:
#export
class MixedPrecision(Callback):
    """Mixed-precision training callback with optional dynamic loss scaling.

    The model is converted with apex's `fp16.convert_network`, the optimizer is
    pointed at FP32 master copies of the parameters, and the training loss is
    multiplied by `loss_scale` before the backward pass.

    Parameters
    - loss_scale: fixed scale, used only when `dynamic` is False.
    - flat_master: forwarded to `get_master` / `to_master_grads` / `to_model_params`.
    - dynamic: start at `max_loss_scale`, divide the scale by `div_factor` whenever
      `grad_overflow` reports an overflow, and multiply it by `div_factor` after
      `scale_wait` consecutive overflow-free steps, never going above `max_loss_scale`.

    NOTE(review): unlike the earlier fixed-scale version in this notebook there is no
    `after_fit` hook here, so nothing in this class casts the model back to FP32
    after training. Confirm that this is intended.
    """
    _order = 99
    def __init__(self, loss_scale=512, flat_master=False, dynamic=True, max_loss_scale=2.**24, div_factor=2., scale_wait=500):
        assert torch.backends.cudnn.enabled, 'Mixed precision training requires cudnn.'

        self.flat_master,self.dynamic,self.max_loss_scale = flat_master,dynamic,max_loss_scale
        self.div_factor,self.scale_wait = div_factor,scale_wait
        self.loss_scale = max_loss_scale if dynamic else loss_scale

    def begin_fit(self):
        # Convert the network, then let the optimizer work on the FP32 master copies.
        self.run.model = fp16.convert_network(self.model, torch.float16)
        self.model_pgs, self.master_pgs = get_master(self.opt, self.flat_master)
        self.run.opt.param_groups = self.master_pgs
        # Number of consecutive overflow-free steps since the scale last changed.
        if self.dynamic: self.count = 0

    def begin_batch(self): self.run.xb = self.run.xb.half()
    def after_pred(self): self.run.pred = self.run.pred.float()
    def after_loss(self):
        # Only scale while training; nothing is back-propagated otherwise.
        if self.in_train: self.run.loss *= self.loss_scale

    def after_backward(self):
        if self.dynamic and grad_overflow(self.model_pgs):
            self.loss_scale /= self.div_factor
            # An overflow restarts the wait, so the scale is not raised again
            # right after it had to be lowered.
            self.count = 0
            self.model.zero_grad()
            return True # skip step and zero_grad
        to_master_grads(self.model_pgs, self.master_pgs, self.flat_master)
        # Undo the loss scaling on the master gradients (with the scale used for this batch).
        for master_params in self.master_pgs:
            for param in master_params:
                if param.grad is not None: param.grad.div_(self.loss_scale)
        if self.dynamic:
            self.count += 1
            if self.count == self.scale_wait:
                self.count = 0
                # Grow the scale, but never beyond the configured maximum.
                self.loss_scale = min(self.loss_scale * self.div_factor, self.max_loss_scale)

    def after_step(self):
        # Clear the model gradients and push the updated master weights back to the model.
        self.model.zero_grad()
        to_model_params(self.model_pgs, self.master_pgs, self.flat_master)

In [40]:
# Rebuild the callback list so it references the MixedPrecision class redefined above;
# the list built earlier still holds the previous class object.
cbfs = [partial(AvgStatsCallback,accuracy),
        CudaCallback,
        ProgressCallback,
        partial(BatchTransformXCallback, norm_imagenette),
        MixedPrecision]

In [41]:
learn = get_learner(nfs, data, 1e-2, conv_layer, cb_funcs=cbfs)

In [42]:
learn.fit(1)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,1.996885,0.34312,2.570356,0.257834,00:12




In [43]:
learn.cbs[-1].loss_scale

32768.0

In [44]:
!python notebook2script.py 10c_fp16.ipynb

Traceback (most recent call last):
  File "/home/sandmann/repo/fastai-course-v3/nbs/dl2/selfmade/notebook2script.py", line 3, in <module>
    import json,fire,re
ModuleNotFoundError: No module named 'fire'
