Re-copy Refactoring part of `01_fully_connected.ipynb`

Refactoring
- Layers as classes
- forward module
- without einsum
- nn.Linear / nn.Module

In [None]:
from torch import tensor
from fastai import datasets
import gzip, pickle
from math import sqrt

In [None]:
# Location of the pickled MNIST dataset; get_data() downloads it with ext='.gz'.
MNIST_URL = 'http://deeplearning.net/data/mnist/mnist.pkl'

In [None]:
def get_data():
    """Download the MNIST pickle and return its train/valid splits as tensors.

    Returns a lazy ``map`` object that yields, in order, x_train, y_train,
    x_valid and y_valid; the third entry of the pickle is discarded.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    # NOTE(review): pickle.load can execute arbitrary code from the file and
    # MNIST_URL is plain http -- only use this with a source you trust.
    with gzip.open(archive, 'rb') as fh:
        train, valid, _ = pickle.load(fh, encoding='latin-1')
    x_tr, y_tr = train
    x_va, y_va = valid
    splits = (x_tr, y_tr, x_va, y_va)
    return map(tensor, splits)

In [None]:
# Unpack the four tensors produced by get_data() (train inputs/labels, valid inputs/labels).
x_train, y_train, x_valid, y_valid = get_data()

In [None]:
def describe(x):
    """Summarize a tensor as (shape, type string, "mean: ..., std: ..." text)."""
    shape, kind = x.shape, x.type()
    summary = f"mean: {x.mean()}, std: {x.std()}"
    return (shape, kind, summary)

In [None]:
# Inspect the raw training inputs (shape, type, mean/std) before normalization.
describe(x_train)

(torch.Size([50000, 784]),
 'torch.FloatTensor',
 'mean: 0.1304190456867218, std: 0.30728983879089355')

Dataset Normalization
---

In [None]:
def normalize(x, m, n):
    """Shift x by m and scale by n, i.e. return (x - m) / n."""
    centered = x - m
    return centered / n

In [None]:
# Statistics come from the training set only.
x_mean, x_std = x_train.mean(), x_train.std()
# Show both splits before normalization.
describe(x_train), describe(x_valid)

((torch.Size([50000, 784]),
  'torch.FloatTensor',
  'mean: 0.1304190456867218, std: 0.30728983879089355'),
 (torch.Size([10000, 784]),
  'torch.FloatTensor',
  'mean: 0.12865190207958221, std: 0.30496466159820557'))

In [None]:
# Normalize both splits with the training-set mean and std (x_mean, x_std).
x_train, x_valid = normalize(x_train, x_mean, x_std) , normalize(x_valid, x_mean, x_std)

In [None]:
# After normalization the training inputs should have mean ~0 and std ~1.
describe(x_train)

(torch.Size([50000, 784]),
 'torch.FloatTensor',
 'mean: 0.00012300178059376776, std: 1.0')

In [None]:
# The validation set was normalized with training statistics, so its mean/std are only approximately 0/1.
describe(x_valid)

(torch.Size([10000, 784]),
 'torch.FloatTensor',
 'mean: -0.005747819785028696, std: 0.9924333691596985')

parameter init with kaiming normalization
---

In [None]:
# n: number of input features, c: number of model outputs, nh: number of hidden units.
n, c, nh = x_train.shape[1], 1, 80 # with cross-entropy/softmax, c would be int(y_train.max()+1) instead of 1
n,nh, c

(784, 80, 1)

In [None]:
from torch import randn, zeros

* Referred to Eq. 10 of the [paper](https://arxiv.org/abs/1502.01852) (He et al., "Delving Deep into Rectifiers"). I misread it and at first *divided* by sqrt(2/n) instead of multiplying!

In [None]:
# Kaiming-style init: standard-normal weights scaled by sqrt(2 / number of inputs to the layer).
w1 = randn(n, nh) * sqrt(2/n)
b1 = zeros(nh)
w2 = randn(nh, c) * sqrt(2/nh)
# Biases start at zero.
b2 = zeros(c) 

In [None]:
# Check the initialized weights; the std should be close to sqrt(2/n).
describe(w1)

(torch.Size([784, 80]),
 'torch.FloatTensor',
 'mean: 0.00014483169070445, std: 0.050243474543094635')

* Why is it okay to leave the bias without normalization?

---

function : loss function, linear computation, activation function
---

In [None]:
# pred: expected as an (n, 1) float tensor; trg: (n,) label tensor, cast to float here.
# Returns the mean squared error as a 1-element float tensor.
def mse_loss(pred, trg):
    diff = pred.squeeze() - trg.float()
    total = (diff ** 2).sum(0)
    return total.unsqueeze(-1) / pred.shape[0]

In [None]:
def lin(x, w, b):
    """Affine layer: matrix-multiply x by w, then add the bias b."""
    projected = x @ w
    return projected + b

In [None]:
def relu(x):
    """Rectified linear unit: entries of x below 0 are replaced by 0."""
    return x.clamp(min=0.)

In [None]:
# First linear layer: project the inputs onto the nh hidden units.
l1 = lin(x_train, w1, b1); l1.shape

torch.Size([50000, 80])

In [None]:
# Hidden activations after the ReLU, with their summary statistics.
a1 = relu(l1); describe(a1)

(torch.Size([50000, 80]),
 'torch.FloatTensor',
 'mean: 0.5839827656745911, std: 0.837601900100708')

In [None]:
# Second linear layer: map the hidden activations to the c outputs.
l2 = lin(a1, w2, b2); describe(l2)

(torch.Size([50000, 1]),
 'torch.FloatTensor',
 'mean: -1.4752967357635498, std: 0.9234262108802795')

- I think Jeremy advised us to subtract 0.5 after the ReLU, but when I pass through l2 the mean is near 0.1, so it seems better not to subtract 0.5.


In [None]:
# What if 0.5 is subtracted from the ReLU output before applying the second linear layer?
describe(lin(relu(l1)-0.5, w2, b2))

(torch.Size([50000, 1]),
 'torch.FloatTensor',
 'mean: -1.0406056642532349, std: 0.9234262108802795')

In [None]:
# A little better: the mean moved by almost the amount I subtracted.

In [None]:
def relu(x):
    """Shifted ReLU: clamp x at 0 from below, then subtract 0.5."""
    rectified = x.clamp(min=0.)
    return rectified - 0.5

In [None]:
# Recompute the forward pass with the redefined relu (ReLU shifted by -0.5).
a1 = relu(l1)
l2 = lin(a1, w2, b2)

In [None]:
# Mean squared error between the network output and the training labels.
loss = mse_loss(l2, y_train)

In [None]:
# Display the loss value.
loss

tensor([37.8519])

In [None]:
# Create the `.g` gradient slots on the parameters, initially empty.
w1.g, w2.g, b1.g, b2.g = [None]*4

In [None]:
def mse_grad(inp, trg):
    """Store the gradient of the mean squared error w.r.t. `inp` on `inp.g`.

    inp: predictions, expected as an (n, 1) float tensor.
    trg: targets of shape (n,); integer labels are accepted.

    The result, 2 * (inp - trg) / n with the shape of `inp`, is written to
    `inp.g`; nothing is returned.
    """
    # Cast the target to float exactly like mse_loss does, so the gradient
    # does not depend on implicit type promotion and keeps a float dtype
    # whatever the label dtype is.
    diff = inp.squeeze(-1) - trg.float()
    inp.g = diff.unsqueeze(-1) * (2 / inp.shape[0])

In [None]:
# Start backprop: gradient of the loss w.r.t. the network output, stored on l2.g.
mse_grad(l2, y_train)

In [None]:
def lin_grad(x,y,w,b):
    """Backward pass of y = x @ w + b.

    Reads the upstream gradient from y.g and stores the gradients for the
    input, weight and bias on x.g, w.g and b.g respectively.
    """
    x.g = y.g.matmul(w.t())
    w.g = x.t().matmul(y.g)
    # The bias is broadcast over the batch, so its gradient sums over dim 0.
    b.g = y.g.sum(dim=0)

In [None]:
# Backprop through the second linear layer: fills a1.g, w2.g and b2.g from l2.g.
lin_grad(a1, l2, w2, b2)

In [None]:
def relu_grad(inp, out):
    """Backward pass of the ReLU: pass out.g through where inp > 0, zero elsewhere.

    The result is stored on inp.g; nothing is returned.
    """
    mask = (inp > 0).float()
    inp.g = out.g * mask

In [None]:
# Backprop through the ReLU: fills l1.g from a1.g.
relu_grad(l1, a1)

In [None]:
# Backprop through the first linear layer: fills x_train.g, w1.g and b1.g from l1.g.
lin_grad(x_train, l1, w1, b1)

Be cautious not to do deep copy....

In [None]:
# Keep copies of the gradients from the functional version so that the
# class-based version can be checked against them later.
w1g, w2g, b1g, b2g = w1.g.clone(), w2.g.clone(), b1.g.clone(), b2.g.clone()

In [None]:
from torch import allclose
def test(a,b,cmp,cname=None):
    if cname is None: cname=cmp.__name__
    assert cmp(a,b),f"{cname}:\n{a}\n{b}"
def near(a,b):
    """Return True when a and b are element-wise close (rtol=1e-3, atol=1e-5)."""
    tolerances = dict(rtol=1e-3, atol=1e-5)
    return allclose(a, b, **tolerances)
def test_near(a,b):
    """Assert that a and b are close according to `near`."""
    test(a, b, cmp=near)

In [None]:
# Be careful: the size of each y_i is now 1.

layers as class
===

In [None]:
class Mse():
    """Mean-squared-error loss layer with a hand-written backward pass."""

    def __call__(self, preds, trg):
        # Keep both operands for backward(); `out` holds the target as float.
        target = trg.float()
        self.inp = preds
        self.out = target
        squared = (preds.squeeze() - target) ** 2
        return squared.sum().unsqueeze(-1) / preds.shape[0]

    def backward(self):
        # Gradient of the mean squared error: 2 * (pred - target) / batch size,
        # reshaped back to the (n, 1) layout of the predictions.
        batch = self.inp.shape[0]
        residual = self.inp.squeeze(-1) - self.out
        self.inp.g = (residual * 2 / batch).unsqueeze(-1)

In [None]:
class Lin():
    """Linear layer out = inp @ w + b with a hand-written backward pass."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self, x):
        # Input and output are kept on the layer because backward() needs both.
        self.inp = x
        self.out = x @ self.w + self.b
        return self.out

    def backward(self):
        # Reads the upstream gradient from self.out.g and stores the results
        # on the weight, the bias and the layer input.
        upstream = self.out.g
        self.w.g = self.inp.t() @ upstream
        self.b.g = upstream.sum(0)
        self.inp.g = upstream @ self.w.t()

In [None]:
class Relu():
    """ReLU shifted by -0.5, with a hand-written backward pass."""

    def __call__(self, x):
        shifted = x.clamp(min=0.) - 0.5
        self.inp = x
        self.out = shifted
        return self.out

    def backward(self):
        # The constant shift does not affect the gradient: it is the upstream
        # gradient where the input was positive and 0 elsewhere.
        positive = (self.inp > 0).float()
        self.inp.g = positive * self.out.g

In [None]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) trained against an Mse loss."""

    def __init__(self, w1, b1, w2, b2):
        first, second = Lin(w1, b1), Lin(w2, b2)
        self.layers = [first, Relu(), second]
        self.loss = Mse()

    def __call__(self, x, y):
        # Run the layers in order, keep the prediction, return the loss.
        out = x
        for layer in self.layers:
            out = layer(out)
        self.pred = out
        return self.loss(out, y)

    def backward(self):
        # Backprop starts at the loss, then walks the layers last to first.
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [None]:
# Clear the stored gradients so the class-based backward pass has to recompute them.
w1.g, w2.g, b1.g, b2.g = [None]*4

In [None]:
# Build the class-based model around the same parameter tensors.
m = Model(w1, b1, w2, b2)

In [None]:
# Forward pass: returns the MSE loss on the training set.
m(x_train, y_train)

tensor([37.8519])

In [None]:
# Backward pass through the loss and all layers; repopulates the .g attributes.
m.backward()

In [None]:
# The class-based gradient of w1 should match the one saved from the functional version.
test_near(w1g, w1.g)

In [None]:
# Same check for the gradient of w2.
test_near(w2g, w2.g)

In [None]:
# Presumably a sanity check of test_near itself: comparing against random
# values is expected to raise an AssertionError.
test_near(w2g, randn(w2g.shape))

AssertionError: ignored