<a href="https://colab.research.google.com/github/SpellOnYou/CLab21/blob/main/dummymodule/2021-05-10-batch_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random, torch, math
# !pip install -q ipdb
# import ipdb

In [2]:
class Linear():
    def __init__(self, w, b):
        self.w, self.b = w, b
    def forward(self, x): 
        self.inp = x
        self.out = self.inp@self.w + self.b
        return self.out
    
    def backward(self):
        self.inp.g = self.out.g @ self.w.t()
        self.w.g = (self.inp.unsqueeze(-1) * self.out.g.unsqueeze(1)).sum(0)
        self.b.g = self.out.g.sum(0)

In [3]:
class Relu():
    def forward(self, x):
        self.inp = x
        self.out = x.clamp_min(0.) - 0.5
        return self.out

    def backward(self):
        self.inp.g = self.out.g* (self.inp>0).float()

In [27]:
class Mse():
    def __call__(self, yhat, y):
        self.yhat, self.y = yhat, y
        self.out = (yhat.squeeze(-1) - y).pow(2).mean()
        return self.out
    
    def backward(self):
        self.yhat.g = 2. * (self.yhat.squeeze() - self.y).unsqueeze(-1) / self.y.shape[0]

In [None]:
#@title
class Softmax():
    def forward(self, x):
        self.inp = x
        self.out = (x.exp()/ x.exp().sum(-1, keepdim=True))
        return self.out

    def backward(self):
        self.inp.g = (1-self.out.g)*self.out.g

class CrossEntropy():
    def __call__(self, y, yhat):
        self.yhat = yhat
        self.y = y
        return (-yhat.log()[range(yhat.shape[0]), y.max(dim=1).indices]).mean()
    
    def backward(self):
        self.yhat.g = self.yhat - self.y

In [28]:
class DummyModel():
    def __init__(self, w1, b1, w2, b2):
        self._loss = Mse()
        self.layers = [Linear(w1,b1), Relu(), Linear(w2, b2)]
    
    def forward(self, x, y):
        self.x, self.y = x, y

        for layer in self.layers:
            # ipdb.set_trace()
            x = layer.forward(x)
        self.out = self._loss(x, self.y)
        return self.out

    def backward(self):
        self._loss.backward()
        for layer in reversed(self.layers):
            # ipdb.set_trace()
            layer.backward()

In [29]:
def init_data():
    x = torch.randn(n, m)
    y = torch.tensor(random.choices(range(6), k=n))
    # y = torch.zeros(n, c)
    # y[range(n), random.choices(range(c), k=n)] =1     
    return x, y    

In [57]:
n,m, h, c = 5000, 100, 50, 1
w1 = torch.randn(m, h) / math.sqrt(h)
w2 = torch.randn(h, c)
b1 = torch.randn(h)
b2 = torch.randn(c)

In [58]:
dummy_x, dummy_y = init_data()

In [59]:
dummy_x.shape, dummy_y.shape

(torch.Size([5000, 100]), torch.Size([5000]))

In [60]:
model = DummyModel(w1, b1, w2, b2)

In [61]:
loss = model.forward(dummy_x, dummy_y)

In [62]:
loss

tensor(20.7741)

In [63]:
model.backward()

In [64]:
[hasattr(l, 'w') for l in model.layers]

[True, False, True]

## Training process

In [65]:
def train(epochs, bs, lr):
    for e in range(epochs):
        for bs_i in range((n-1)//bs + 1):
            str_idx, end_idx = bs_i*bs, (bs_i+1)*bs
            x_batch, y_batch = dummy_x[str_idx:end_idx], dummy_y[str_idx:end_idx]
            loss = model.forward(x_batch, y_batch)
            model.backward()
            with torch.no_grad():
                for layer in model.layers:
                    if hasattr(layer, 'w'): #if they have parameter attribute
                        print(f"{layer.w.mean(), layer.w.std()}")
                        layer.w -= layer.w.g * lr
                        layer.b -= layer.b.g * lr
                        layer.w.g.zero_() #initialize them to zero
                        layer.b.g.zero_()

In [66]:
train(1, 32, 0.01)

(tensor(-0.0028), tensor(0.1404))
(tensor(-0.0303), tensor(0.9379))
(tensor(-0.0029), tensor(0.1402))
(tensor(-0.0276), tensor(0.9332))
(tensor(-0.0029), tensor(0.1400))
(tensor(-0.0270), tensor(0.9295))
(tensor(-0.0027), tensor(0.1398))
(tensor(-0.0278), tensor(0.9225))
(tensor(-0.0025), tensor(0.1396))
(tensor(-0.0260), tensor(0.9168))
(tensor(-0.0025), tensor(0.1394))
(tensor(-0.0267), tensor(0.9113))
(tensor(-0.0027), tensor(0.1391))
(tensor(-0.0291), tensor(0.9063))
(tensor(-0.0026), tensor(0.1390))
(tensor(-0.0274), tensor(0.9024))
(tensor(-0.0026), tensor(0.1389))
(tensor(-0.0287), tensor(0.8982))
(tensor(-0.0027), tensor(0.1388))
(tensor(-0.0274), tensor(0.8939))
(tensor(-0.0027), tensor(0.1386))
(tensor(-0.0264), tensor(0.8916))
(tensor(-0.0026), tensor(0.1385))
(tensor(-0.0241), tensor(0.8890))
(tensor(-0.0027), tensor(0.1382))
(tensor(-0.0274), tensor(0.8827))
(tensor(-0.0028), tensor(0.1380))
(tensor(-0.0266), tensor(0.8794))
(tensor(-0.0026), tensor(0.1379))
(tensor(-0.026

In [67]:
loss = model.forward(dummy_x, dummy_y); loss

tensor(3.0660)

- learning rate 0.1 None -> 0.01 -> cool!