[course video and other materials](https://course.fast.ai/videos/?lesson=8)

# Setting

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
!git clone https://github.com/fastai/course-v3.git

Cloning into 'course-v3'...
remote: Enumerating objects: 5498, done.[K
remote: Total 5498 (delta 0), reused 0 (delta 0), pack-reused 5498[K
Receiving objects: 100% (5498/5498), 258.00 MiB | 31.68 MiB/s, done.
Resolving deltas: 100% (2992/2992), done.
Checking out files: 100% (860/860), done.


In [None]:
%cd /content/course-v3/nbs/dl2/

/content/course-v3/nbs/dl2


In [None]:
from exp.nb_01 import *

In [None]:
MNIST_URL

'http://deeplearning.net/data/mnist/mnist.pkl'

In [None]:
def get_data():
    """Fetch the gzipped MNIST pickle and return its train/valid arrays as tensors.

    Returns:
        A lazy ``map`` that yields ``train_x, train_y, valid_x, valid_y`` (in
        that order), each passed through ``tensor``. The third split stored in
        the pickle is discarded.
    """
    archive_path = datasets.download_data(MNIST_URL, ext=".gz")
    # NOTE(review): pickle.load can execute arbitrary code from the file it
    # reads; only point MNIST_URL at a source you trust.
    with gzip.open(archive_path, 'rb') as fh:
        splits = pickle.load(fh, encoding='latin-1')
    train_split, valid_split, _ = splits
    x_train, y_train = train_split
    x_valid, y_valid = valid_split
    return map(tensor, (x_train, y_train, x_valid, y_valid))

In [None]:
train_x, train_y, valid_x, valid_y = get_data()

Downloading http://deeplearning.net/data/mnist/mnist.pkl.gz


In [None]:
train_mean, train_std = train_x.mean(), train_x.std()

In [None]:
train_x.mean(), train_x.std(), valid_x.mean(), valid_x.std()

(tensor(0.1304), tensor(0.3073), tensor(0.1287), tensor(0.3050))

In [None]:
def normalize(x, m, n):
    """Standardize ``x``: subtract ``m``, then divide the result by ``n``."""
    centered = x - m
    return centered / n

In [None]:
# important!!!
train_x = normalize(train_x, train_mean, train_std)
valid_x = normalize(valid_x, train_mean, train_std)

In [None]:
train_x.mean(), train_x.std()

(tensor(0.0001), tensor(1.))

In [None]:
valid_x.std(), valid_x.mean()

(tensor(0.9924), tensor(-0.0057))

In [None]:
def test_near_zero(x, tol=1e-3): assert x.abs() < tol, f"Not near zero, value: {x}"

In [None]:
test_near_zero(train_x.mean())

In [None]:
test_near_zero(1-train_x.std())

# Foundation version

## Set Parameters with Adequate method

In [None]:
# n: number of input features per example (second dimension of train_x).
# It must be assigned here because the weight-initialization cells below read it.
n = train_x.shape[1]
# nh: width of the hidden layer
nh = 80

In [None]:
w1, b1 = torch.randn(n, nh), torch.zeros(nh)

In [None]:
w2, b2 = torch.randn(nh, 1), torch.zeros(1)

In [None]:
def describe(x, var: str=''):
    """Summarize a tensor as ``"<var> mean: <mean>, std: <std>"``.

    Args:
        x: object exposing ``mean()`` and ``std()`` (e.g. a torch tensor).
        var: optional label placed in front of the statistics.

    Returns:
        The formatted summary string. When ``var`` is empty the label and its
        separating space are omitted, so the result has no leading blank.
    """
    stats = f"mean: {x.mean()}, std: {x.std()}"
    return f"{var} {stats}" if var else stats

In [None]:
describe(w1)

'mean: -0.004014730919152498, std: 1.0009307861328125'

In [None]:
describe(w2)

'mean: 0.054591961205005646, std: 0.8732153177261353'

In [None]:
# standard version of xavier
w1 = w1/math.sqrt(n)

In [None]:
describe(w1)

'mean: -0.00014338297478388995, std: 0.03574752435088158'

In [None]:
w2 = w2/math.sqrt(nh)
describe(w2)

'mean: 0.00301930820569396, std: 0.11207561939954758'

In [None]:
def relu(x):
    """Rectified linear unit: every entry of ``x`` below zero becomes zero."""
    rectified = x.clamp(min=0.)
    return rectified

In [None]:
def lin(x, a, b):
    """Affine map: matrix-multiply ``x`` by ``a`` and add ``b``."""
    projected = x @ a
    return projected + b

In [None]:
# Compute the first linear layer's output before summarizing it
# (t1 has to exist before describe can read it).
t1 = lin(valid_x, w1, b1)
describe(t1)

'mean: -0.008880119770765305, std: 1.0219519138336182'

In [None]:
a1 = relu(t1)

In [None]:
describe(a1)

'mean: 0.3990243077278137, std: 0.605105996131897'

---

In [None]:
torch.random.initial_seed()

16253574395535731340

In [None]:
## Using kaiming initialization
w1, b1 = torch.randn(n, nh)*math.sqrt(2/n), torch.zeros(nh)
w1.shape

torch.Size([784, 80])

[^1]: TODO: Regarding the PyTorch library,<br/>why initialize with zeros?<br/>
What is the difference between uniform and normal initialization?

In [None]:
# First-layer parameters: zero biases, Kaiming-uniform weights
from torch.nn import init
b1 = torch.zeros(nh)
# kaiming_uniform_ fills the tensor in place and hands the same tensor back
w1 = init.kaiming_uniform_(torch.zeros(n, nh), mode='fan_out')

[^4]: These are in-place functions; I had understood that they don't return a value.<br/>TODO: inspect the source code.

In [None]:
# Second-layer parameters: zero bias, Kaiming-uniform weights
b2 = torch.zeros(1)
w2 = init.kaiming_uniform_(torch.zeros(nh, 1), mode='fan_out')

[^2]: TODO (find out yourself): why do they transpose the weights in the linear layer, and why must we then define fan_out and fan_in conversely?

In [None]:
describe(w1)

'mean: -8.060170512180775e-05, std: 0.05042649805545807'

In [None]:
# ReLU shifted down by 0.5
def relu(x):
    """Clamp ``x`` at zero from below, then subtract 0.5 from every entry."""
    return x.clamp(min=0.) - 0.5

In [None]:
# Run the forward pass one stage at a time, printing mean/std after each stage
l1 = lin(train_x, w1, b1)
print(describe(l1, 'linear_1'))

a1 = relu(l1)
print(describe(a1, 'activation_1'))

l2 = lin(a1, w2, b2)
print(describe(l2, 'linear_2'))

linear_1 mean: 0.05869802460074425, std: 1.4556964635849
activation_1 mean: 0.10449501127004623, std: 0.8618189692497253
linear_2 mean: -0.8571696877479553, std: 1.0907652378082275


[^3]: There are still a lot of shortcomings... why did the mean change again?

In [None]:
def model(x):
    """Two-layer forward pass: lin(w1, b1) -> relu -> lin(w2, b2)."""
    hidden = lin(x, w1, b1)
    activated = relu(hidden)
    return lin(activated, w2, b2)

preds = model(train_x)

In [None]:
describe(preds)

' mean: -0.8571696877479553, std: 1.0907652378082275'

In [None]:
%timeit -n 10 model(valid_x)

10 loops, best of 3: 17.6 ms per loop


## Loss Function: MSE

In [None]:
train_y, valid_y = train_y.float(), valid_y.float()

In [None]:
def mse(inp, trg):
    """Mean squared error between ``inp`` (trailing unit axis squeezed) and ``trg``.

    The squared differences are summed and divided by ``inp.shape[0]``.
    """
    residual = inp.squeeze(-1) - trg
    squared_total = residual.pow(2).sum()
    return squared_total / inp.shape[0]

In [None]:
# def mse(output, targ): return (output.squeeze(-1) - targ).pow(2).mean()

## GRADIENT AND BACKWARD PASS

In [None]:
def mse_grad(inp, trg):
    """Store the gradient of ``mse(inp, trg)`` with respect to ``inp`` on ``inp.g``."""
    residual = inp.squeeze(-1) - trg
    scale = 2 / inp.shape[0]
    # restore the trailing axis so the gradient lines up with inp
    inp.g = residual.unsqueeze(-1) * scale

In [None]:
mse_grad(preds, train_y)

In [None]:
def lin_grad(x, y, w, b):
    """Backward pass of ``y = x @ w + b``; reads ``y.g`` and writes ``x.g``, ``w.g``, ``b.g``.

    ``y.g`` is viewed as ``(x.shape[0], -1)`` before the matrix products.
    """
    # x: layer input (activation), y: layer output (preds)
    grad_out = y.g.view(x.shape[0], -1)
    x.g = grad_out @ w.t()
    w.g = x.t() @ grad_out
    b.g = y.g.sum(0)

In [None]:
%timeit -n 10 lin_grad(a1, preds, w2, b2)

10 loops, best of 3: 7.78 ms per loop


In [None]:
def lin_grad(x,y,w,b):
    """Backward pass of ``y = x @ w + b``; reads ``y.g`` and writes ``x.g``, ``w.g``, ``b.g``."""
    grad_out = y.g
    x.g = grad_out.matmul(w.t())
    w.g = x.t().matmul(grad_out)
    b.g = grad_out.sum(0)

In [None]:
preds.g.shape, w2.t().shape

(torch.Size([50000, 1]), torch.Size([1, 80]))

In [None]:
%timeit -n 10 lin_grad(a1, preds, w2, b2)

10 loops, best of 3: 7.92 ms per loop


In [None]:
def relu_grad(inp, out):
    """Backward pass of relu: pass ``out.g`` through where ``inp`` is positive, zero elsewhere."""
    positive_mask = (inp > 0).float()
    inp.g = out.g * positive_mask

In [None]:
def init_params():
    """Create fresh parameters for the two-layer network.

    Weights start as zeros and are filled in place with Kaiming-uniform
    values (``mode='fan_out'``); biases stay at zero.

    Returns:
        ``(w1, w2, b1, b2)`` - note the weights-first ordering.
    """
    n_in = train_x.shape[1]
    first_w, first_b = torch.zeros(n_in, nh), torch.zeros(nh)
    second_w, second_b = torch.zeros(nh, 1), torch.zeros(1)
    # Second layer is filled first, so the random stream is consumed in the
    # same order as before.
    for weight in (second_w, first_w):
        init.kaiming_uniform_(weight, mode='fan_out')
    return (first_w, second_w, first_b, second_b)

w1, w2, b1, b2 = init_params()
w1.shape, w2.shape, b1.shape, b2.shape

(torch.Size([784, 80]), torch.Size([80, 1]), torch.Size([80]), torch.Size([1]))

In [None]:
def fwd_n_bwd(inp, trg):
    """Run one forward and one manual backward pass through the two-layer net.

    Uses the module-level ``w1, b1, w2, b2``. Gradients are left on the ``.g``
    attribute of ``inp``, the parameters and the intermediate tensors; a
    progress line is printed after each backward step. Returns nothing.
    """
    # forward
    pre_act = lin(inp, w1, b1)
    act = relu(pre_act)
    pred = lin(act, w2, b2)
    loss = mse(pred, trg)  # value is not read by the backward steps below

    # backward, from the loss towards the input
    mse_grad(pred, trg)             # sets pred.g
    print("done mse")
    lin_grad(act, pred, w2, b2)     # sets act.g, w2.g, b2.g
    print("done lin2, ")
    relu_grad(pre_act, act)         # sets pre_act.g
    print("done relu")
    lin_grad(inp, pre_act, w1, b1)  # sets inp.g, w1.g, b1.g
    print("done lin1")
fwd_n_bwd(valid_x, valid_y)

done mse
done lin2, 
done relu
done lin1


---

[^5]: Why do we need to save train_x's gradient?<br/>It's not a parameter.

In [None]:
# Snapshot the hand-computed gradients so later cells can compare against them
w1g, w2g, b1g, b2g, xvg = (
    t.g.clone() for t in (w1, w2, b1, b2, valid_x)
)

In [None]:
w1g.shape, w2g.shape, b1g.shape, b2g.shape, xvg.shape

[^6]: What is clone? what is copy?? what's difference from extend_as

In [None]:
# Independent copies of the parameters and validation inputs, with gradient tracking switched on
w12, w22, b12, b22, xv2 = (
    t.clone().requires_grad_(True) for t in (w1, w2, b1, b2, valid_x)
)

In [None]:
def forward(inp):
    """Forward pass through the gradient-tracking copies ``w12, b12, w22, b22``."""
    hidden = relu(lin(inp, w12, b12))
    return lin(hidden, w22, b22)

In [None]:
y2 = forward(xv2)

In [None]:
loss = mse(y2, valid_y)

In [None]:
%time loss.backward()

CPU times: user 45.8 ms, sys: 4.9 ms, total: 50.7 ms
Wall time: 144 ms


In [None]:
test_near(w1g, w12.grad)
test_near(w2g, w22.grad)
test_near(b1g, b12.grad)
test_near(b2g, b22.grad)
test_near(xvg, xv2.grad)

[^8]: Why doesn't he use xt.g, w1.g, b1.g, and so on?

[^7]: Why doesn't a cloned tensor have the same attributes?<br/>
w12 is a clone of w1, and w1 had w1.g.

# Refactoring

## Layers as classes

In [None]:
class Relu():
    """ReLU shifted down by 0.5, with a hand-written backward pass.

    Calling the layer stores its input and output; ``backward`` then reads
    ``self.out.g`` and writes the gradient to ``self.inp.g``.
    """
    def __call__(self, inp):
        self.inp = inp
        # Out-of-place clamp: the caller's tensor (typically the previous
        # layer's stored output) must not be overwritten by the forward pass.
        self.out = inp.clamp_min(0.) - 0.5
        return(self.out)

    def backward(self):
        # gradient flows only where the input was positive
        self.inp.g = self.out.g * (self.inp>0).float()

class Lin():
    """Affine layer ``inp @ w + b`` with a hand-written backward pass."""

    def __init__(self, w, b):
        self.w, self.b = w, b

    def __call__(self, inp):
        # keep input and output around; backward needs both
        self.inp = inp
        self.out = inp @ self.w + self.b
        return self.out

    def backward(self):
        """Read ``self.out.g`` and write ``.g`` on the input, the weight and the bias."""
        # out.g is used as-is (no .view reshape)
        grad_out = self.out.g
        self.inp.g = grad_out @ self.w.t()
        self.w.g = self.inp.t() @ grad_out
        self.b.g = grad_out.sum(0)

class Mse():
    """Mean-squared-error loss with a hand-written backward pass."""

    def __call__(self, inp, trg):
        self.inp, self.trg = inp, trg
        residual = inp.squeeze() - trg
        self.out = residual.pow(2).sum() / inp.shape[0]
        return self.out

    def backward(self):
        """Write the gradient of the loss with respect to the input on ``self.inp.g``."""
        residual = self.inp.squeeze() - self.trg
        self.inp.g = 2. * residual.unsqueeze(-1) / self.inp.shape[0]

class Model():
    """Lin -> Relu -> Lin network followed by an Mse loss."""

    def __init__(self, w1, w2, b1, b2):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, inp, out):
        """Feed ``inp`` through every layer and return the loss against ``out``."""
        act = inp
        for layer in self.layers:
            act = layer(act)
        return self.loss(act, out)

    def backward(self):
        """Back-propagate: the loss first, then the layers from last to first."""
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

# Reset the hand-computed gradients. Done inline because the init_grad helper
# is only defined in a later cell, so calling it here fails on a fresh kernel.
w1.g, w2.g, b1.g, b2.g = [None]*4

In [None]:
model = Model(w1, w2, b1, b2)

In [None]:
%time loss = model(train_x, train_y)

CPU times: user 520 ms, sys: 2.4 ms, total: 522 ms
Wall time: 524 ms


In [None]:
%time model.backward()

CPU times: user 193 ms, sys: 130 ms, total: 323 ms
Wall time: 324 ms


In [None]:
test_near(w1g, w1.g)
test_near(w2g, w2.g)
test_near(b1g, b1.g)
test_near(b2g, b2.g)

## Module.forward()

Which use only backwards

[^9]: I can see that this approach removes the `__call__` method from each layer, but why, and how?

Seems like used at module once....?

In [None]:
class Module():
    """Base class for layers with a hand-written backward pass.

    Subclasses implement ``forward(*args)`` and ``bwd(out, *args)``.
    ``__call__`` records the arguments and the output so that ``backward()``
    can hand them to ``bwd`` later.
    """
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        """Compute the layer output; must be overridden."""
        raise NotImplementedError(f"{type(self).__name__} must implement forward()")

    def backward(self): self.bwd(self.out, *self.args)

    def bwd(self, out, *args):
        """Store gradients on the ``.g`` attributes; must be overridden."""
        raise NotImplementedError(f"{type(self).__name__} must implement bwd()")

class Relu(Module):
    """ReLU shifted down by 0.5."""
    def forward(self, inp):
        # Out-of-place clamp: the tensor handed in by the caller (typically
        # the previous layer's stored output) must not be overwritten.
        return inp.clamp_min(0.) - 0.5
    def bwd(self, out, inp):
        # gradient flows only where the input was positive
        inp.g = (inp>0).float() * out.g

class Lin(Module):
    """Affine layer ``inp @ w + b`` whose backward pass is written with einsum."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def forward(self, inp):
        return inp @ self.w + self.b

    def bwd(self, out, inp):
        grad_out = out.g
        # [^10]: einsum spelling of grad_out @ self.w.t()
        inp.g = torch.einsum("ia, ka -> ik", grad_out, self.w)
        # einsum spelling of inp.t() @ grad_out
        self.w.g = torch.einsum("bi, bj ->ij", inp, grad_out)
        self.b.g = grad_out.sum(0)

class Mse(Module):
    """Mean-squared-error loss."""

    def forward(self, inp, trg):
        residual = inp.squeeze() - trg
        return residual.pow(2).sum() / inp.shape[0]

    def bwd(self, out, inp, trg):
        # `out` (the loss value) is part of the bwd signature but is not needed here
        residual = inp.squeeze() - trg
        inp.g = 2. * residual.unsqueeze(-1) / inp.shape[0]


class Model():
    """Lin -> Relu -> Lin network with an Mse loss.

    The layers are built from the module-level ``w1, b1, w2, b2``.
    """
    def __init__(self):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, inp, trg):
        """Feed ``inp`` through every layer and return the loss against ``trg``."""
        for l in self.layers: inp = l(inp)
        return self.loss(inp, trg)

    def backwards(self):
        """Back-propagate: the loss first, then the layers from last to first."""
        self.loss.backward()
        for r in reversed(self.layers): r.backward()

    # Alias so the model offers the same `backward()` name as the layers;
    # `backwards()` stays available for existing callers.
    backward = backwards


In [None]:
def init_grad():
    """Reset the manual gradient attribute ``g`` of w1, w2, b1 and b2 to None."""
    for param in (w1, w2, b1, b2):
        param.g = None

In [None]:
init_grad()
m = Model()
m(train_x, train_y)

tensor(36.6717)

In [None]:
%time m.backwards()

CPU times: user 183 ms, sys: 132 ms, total: 315 ms
Wall time: 315 ms


In [None]:
init_grad()
model= Model()
model(valid_x, valid_y)

tensor(37.3602)

In [None]:
%timeit -n 10 model.backwards()

10 loops, best of 3: 48.1 ms per loop


## without einsum

In [None]:
# del Lin
Lin

NameError: ignored

In [None]:
import gc; gc.collect()

846

In [None]:
class Lin(Module):
    """Affine layer ``inp @ w + b`` whose backward pass uses plain matrix products."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def forward(self, inp):
        return inp @ self.w + self.b

    def bwd(self, out, inp):
        grad_out = out.g
        inp.g = grad_out.matmul(self.w.t())
        self.w.g = inp.t().matmul(grad_out)
        self.b.g = grad_out.sum(0)

In [None]:
init_grad()
m2= Model()
m2(valid_x, valid_y)

tensor(37.3602)

In [None]:
%time m2.backwards()

CPU times: user 45.6 ms, sys: 1.03 ms, total: 46.6 ms
Wall time: 46.6 ms


> ??? The version without einsum is the fastest
---

[^10]: TODO: check this, since Jeremy's result is a little different from mine.

## nn.Linear and nn.Module

In [None]:
from torch import nn

In [None]:
# inp: number of input features per example (columns of train_x), not the
# number of examples; nn.Linear(inp, nh) below needs the feature count.
inp, out = train_x.shape[1], 1

In [None]:
inp, out

(50000, 1)

In [None]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear network that returns the MSE loss directly.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList registers the sub-layers, so parameters(), state_dict()
        # and .to() can see them; a plain Python list would hide them.
        self.layers = nn.ModuleList([nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)])
        self.loss = mse

    def forward(self, x, y):
        """Run ``x`` through the layers and return the loss against ``y``.

        Defined as ``forward`` so that nn.Module's own ``__call__`` dispatches
        to it; ``model(x, y)`` works as before.
        """
        for l in self.layers: x = l(x)
        return self.loss(x.squeeze(), y)

In [None]:
inp, nh, out

(784, 80, 1)

In [None]:
m3 = Model(inp, nh, out)

In [None]:
loss = m3(train_x, train_y)

In [None]:
%time loss.backward()

CPU times: user 90.5 ms, sys: 822 µs, total: 91.3 ms
Wall time: 91.5 ms


[^11]: What's the difference between `nn.ReLU` and `nn.functional.relu`?

[^12]: why did jeremy input the squeezed tensor at loss? ***(Solved), answers are below.***

In [None]:
layers = [nn.Linear(inp, nh), nn.ReLU(), nn.Linear(nh, out)]

In [None]:
x = train_x.clone()

In [None]:
for l in layers: x = l(x)

In [None]:
x.shape

torch.Size([50000, 1])