In [26]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F

In [27]:
from fastcore.test import test_close

# Notebook-wide display and reproducibility settings.
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
# Seeds torch's RNG only; Python's `random` module is not seeded here.
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

# Location of the gzipped MNIST pickle.
MNIST_URL = 'https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'

# Local cache: data/mnist.pkl.gz is downloaded only when the file does not exist yet.
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'
from urllib.request import urlretrieve
if not path_gz.exists():
    urlretrieve(MNIST_URL , path_gz)

# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.
# The pickle unpacks into (train, valid, <ignored>) where train and valid are (x, y) pairs.
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert the four arrays to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

In [28]:
# n, m: the two dimensions of x_train (rows, columns).
n,m = x_train.shape
# Largest label + 1. This is a 0-dim tensor, not a Python int.
# Presumably the number of classes (assumes labels start at 0) - TODO confirm.
c = y_train.max()+1
# Hidden-layer width passed to the models below.
nh=50

In [29]:
class Model(nn.Module):
    """Small MLP: Linear(n_in, nh) -> ReLU -> Linear(nh, n_out).

    The layers are kept in an `nn.ModuleList` rather than a plain Python list
    so that they are registered as submodules. With a plain list,
    `parameters()` and `state_dict()` are empty and `.to(device)` does not
    move the weights. `self.layers` is still iterable, so code that loops over
    `model.layers` keeps working.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])

    # Implement `forward` (not `__call__`) so nn.Module's own `__call__`,
    # including its hook handling, stays in charge of invoking the model.
    def forward(self, x):
        """Apply every layer in order to `x` and return the final activations."""
        for l in self.layers: x = l(x)
        return x

In [30]:

# Build the MLP (m inputs, nh hidden units, 10 outputs) and run the whole training set through it.
model = Model(m, nh, 10)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

## Cross Entropy

First we need to compute the softmax of our activations:

$softmax(z_j) = \frac{e^{z_j}}{\sum_{k=1}^K e^{z_k}}$

\begin{equation}
\text{log-softmax}(z_j) = \log \left(\frac{e^{z_j}}{\sum_{k=1}^K e^{z_k}}\right)
\end{equation}
<br>
\begin{equation}
\text{log-softmax}(z_j) = z_j - \log \left(\sum_{k=1}^K e^{z_k}\right)
\end{equation}


In [31]:
def log_softmax(x):
  "Literal log-softmax: exponentiate, normalise over the last dimension, then take the log."
  exps = x.exp()
  probs = exps / exps.sum(-1 , keepdim=True)
  return probs.log()

def log_softmax(x):
  "Log-softmax through the identity log(e^x / sum(e^x)) = x - log(sum(e^x)), summing over the last dimension."
  log_norm = x.exp().sum(-1 , keepdim=True).log()
  return x - log_norm


there is a way to compute the log of the sum of exponentials in a more stable way, called the LogSumExp trick. The idea is to use the following formula:

<br>

$
\log \left(\sum_{k=1}^K e^{z_k}\right) = \log \left(e^a\sum_{k=1}^K e^{z_k - a}\right) = a + \log \left(\sum_{k=1}^K e^{z_k - a}\right)
$

 where $a$ is the maximum of the $z_k$. Every exponent $z_k - a$ is then at most 0, so the exponentials can no longer overflow.

In [32]:
def log_softmax(x):
  """Log-softmax over the last dimension using the LogSumExp trick.

  The per-row maximum `a` is subtracted before exponentiating, so every
  exponent is <= 0 and `exp` cannot overflow:
      log_softmax(x) = x - a - log(sum(exp(x - a)))

  `a` is taken with keepdim=True so that it broadcasts against `x` along the
  last dimension. A shape-(N,) maximum cannot be subtracted from an (N, C)
  input. This also makes the function work for inputs of any rank.
  """
  a = x.max(-1, keepdim=True)[0]
  shifted = x - a
  return shifted - shifted.exp().sum(-1 , keepdim=True).log()

def log_softmax(x):
  "Log-softmax over the last dimension, delegating the log-sum-exp term to PyTorch's `logsumexp`."
  lse = torch.logsumexp(x, dim=-1, keepdim=True)
  return x - lse


In [34]:
# Log-softmax of the model outputs computed above.
sm = log_softmax(pred)

Cross entropy $= -\sum_i x_i \log p(x_i)$

Since the targets $x_i$ are one-hot encoded, this reduces to $-\log p(x_i)$,
where $i$ is the index of the true class.

In [25]:
def nll(input , target):
  "Mean negative log-likelihood: pick `input[row, target[row]]` for every row and return minus the mean."
  n_rows = target.shape[0]
  picked = input[range(n_rows) , target]
  return -picked.mean()

In [37]:
# Negative log-likelihood of the training labels under `sm`.
loss = nll(sm , y_train)
loss

tensor(2.30, grad_fn=<NegBackward0>)

In [42]:
# The hand-written loss should match PyTorch's log_softmax followed by nll_loss.
test_close(loss , F.nll_loss(F.log_softmax(pred,-1),y_train))

In [38]:
# F.cross_entropy applied to the raw predictions should give the same value.
test_close(loss , F.cross_entropy(pred,y_train))

So basically `F.cross_entropy(x, target)` = `F.nll_loss(F.log_softmax(x, -1), target)` = $-\log(p_i)$, where $\log(p_i) = z_i - \log (\sum_{k=1}^K e^{z_k})$ and $i$ is the index of the true class.

In [56]:
# From here on, use PyTorch's cross-entropy as the loss function.
loss_func = F.cross_entropy

# Take the first `bs` training examples as one mini-batch and run them through the model.
bs = 50
xb = x_train[0:bs]
pred = model(xb)
# Not the cell's last expression, so by default nothing is displayed for it.
pred.shape
yb = y_train[:bs]

In [55]:
#export
def accuracy(out , yb):
  "Fraction of rows of `out` whose arg-max along dim 1 equals the corresponding entry of `yb`."
  hits = out.argmax(dim=1) == yb
  return hits.float().mean()

def report(loss , preds ,yb):
  "Print `loss` and the accuracy of `preds` against `yb`, each formatted to two decimals."
  acc = accuracy(preds,yb)
  print(f"loss : {loss:.2f}  accuracy : {acc:.2f}")

In [51]:
# Accuracy of the current model on the mini-batch built above.
accuracy(pred , yb)

tensor(0.08)

In [52]:
# Learning rate used by the SGD updates below.
lr = .5
# Number of passes over the training set.
epochs=3

In [61]:
# Manual mini-batch SGD: walk the training set in slices of `bs`, backpropagate,
# then update each layer's weight and bias by hand.
for epoch in range(epochs):
  for i in range(0,n,bs):
    s = slice(i , min(n,i+bs))  # the last slice is clipped to n
    xb = x_train[s]
    yb = y_train[s]
    pred = model(xb)
    loss = loss_func(pred,yb)
    loss.backward()
    # The updates themselves must not be tracked by autograd.
    with torch.no_grad():
      for l in model.layers:
        # Layers without a `weight` attribute (the ReLU) have nothing to update.
        if not hasattr(l ,'weight'): continue
        l.weight.sub_(lr * l.weight.grad)
        l.bias.sub_(lr * l.bias.grad)
        # Reset the gradients so the next backward() does not accumulate into them.
        l.weight.grad.zero_()
        l.bias.grad.zero_()
  # Metrics of the last mini-batch of the epoch only.
  report(loss,pred,yb)
          

loss : 0.07  accuracy : 0.98
loss : 0.06  accuracy : 0.98
loss : 0.03  accuracy : 1.00


### Using parameters and optim

In [67]:
# Assigning a module as an attribute of an nn.Module registers it as a child,
# so it appears in named_children() and its tensors in parameters().
m1 = nn.Module()
m1.foo = nn.Linear(2,3)
m1 , list(m1.named_children()) , list(m1.parameters())

(Module(
   (foo): Linear(in_features=2, out_features=3, bias=True)
 ),
 [('foo', Linear(in_features=2, out_features=3, bias=True))],
 [Parameter containing:
  tensor([[ 0.07,  0.51],
          [ 0.49, -0.21],
          [-0.60, -0.20]], requires_grad=True),
  Parameter containing:
  tensor([ 0.44,  0.29, -0.15], requires_grad=True)])

In [68]:
class Model(nn.Module):
    """MLP whose layers are plain attributes (`l1`, `l2`, `relu`), computing l2(relu(l1(x)))."""
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1, self.l2 = nn.Linear(n_in,nh), nn.Linear(nh,n_out)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output of the second linear layer applied to the ReLU of the first."""
        hidden = self.relu(self.l1(x))
        return self.l2(hidden)

In [69]:
# Fresh instance of the Model class defined just above (m inputs, nh hidden units, 10 outputs).
model =Model(m,nh,10)

In [73]:
# Same mini-batch SGD, but the update now iterates over model.parameters()
# instead of touching each layer's weight and bias by name.
for epoch in range(epochs):
  for i in range(0,n,bs):
    s = slice(i , min(n,i+bs))  # the last slice is clipped to n
    xb = x_train[s]
    yb = y_train[s]
    pred = model(xb)
    loss = loss_func(pred,yb)
    loss.backward()
    # The updates themselves must not be tracked by autograd.
    with torch.no_grad():
      for p in model.parameters():
        p.sub_(p.grad * lr)
        # Reset so the next backward() does not accumulate into this gradient.
        p.grad.zero_()
  # Metrics of the last mini-batch of the epoch only.
  report(loss,pred,yb)

loss : 0.17  accuracy : 0.94
loss : 0.17  accuracy : 0.96
loss : 0.11  accuracy : 0.96


In [74]:
# Plain Python list of layers: Linear -> ReLU -> Linear.
# NOTE(review): not used by any visible cell; presumably meant as the argument to SequentialModel.
layers = [nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10)]


In [None]:
class SequentialModel(nn.Module):
    """Chains the given layers, holding them in an `nn.ModuleList`."""
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        """Feed `x` through each layer in turn and return the last output."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out
     

In [88]:
class Optimizer():
  """Minimal SGD optimizer: `p <- p - lr * p.grad` for every parameter.

  params: iterable of tensors to optimise; materialised into the list `self.param`.
  lr:     learning rate (default .5).

  Parameters whose `.grad` is None (no backward pass yet, or not part of the
  graph) are skipped by both `step` and `zero_grad` instead of raising.
  """
  def __init__(self,params , lr=.5):
    self.param = list(params)
    self.lr=lr

  def step(self):
    "Apply one in-place SGD update to every parameter that has a gradient."
    # The update itself must not be tracked by autograd.
    with torch.no_grad():
      for p in self.param:
        if p.grad is None: continue
        p -= p.grad * self.lr

  def zero_grad(self):
    "Zero the gradients in place so the next backward() starts from zero."
    # no_grad replaces the legacy `.data` access for the untracked in-place reset.
    with torch.no_grad():
      for p in self.param:
        if p.grad is not None: p.grad.zero_()

In [89]:
# Same architecture built with PyTorch's own nn.Sequential.
model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))


In [90]:
# Hand-written optimizer over the model's parameters; lr falls back to the class default of .5.
opt = Optimizer(model.parameters())

In [91]:

# Mini-batch training loop with the parameter update and gradient reset delegated to `opt`.
for epoch in range(epochs):
    for i in range(0, n, bs):
        # The last slice is clipped to n.
        s = slice(i, min(n,i+bs))
        xb,yb = x_train[s],y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        # Clear the gradients so the next backward() does not accumulate into them.
        opt.zero_grad()
    # Metrics of the last mini-batch of the epoch only.
    report(loss, preds, yb)

loss : 0.10  accuracy : 0.98
loss : 0.06  accuracy : 0.98
loss : 0.04  accuracy : 1.00


In [93]:
from torch import optim

In [95]:
# Swap the hand-written optimizer for torch.optim.SGD with the same learning rate variable.
opt = optim.SGD(model.parameters() , lr=lr)

In [96]:
# Same loop as before; `opt` is now the torch.optim.SGD instance created above.
for epoch in range(epochs):
    for i in range(0, n, bs):
        # The last slice is clipped to n.
        s = slice(i, min(n,i+bs))
        xb,yb = x_train[s],y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        # Clear the gradients so the next backward() does not accumulate into them.
        opt.zero_grad()
    # Metrics of the last mini-batch of the epoch only.
    report(loss, preds, yb)

loss : 0.03  accuracy : 1.00
loss : 0.02  accuracy : 1.00
loss : 0.02  accuracy : 1.00


### Dataset and DataLoader

In [115]:
class Dataset():
  "Pairs inputs `x` with targets `y`; indexing returns the matching `(x[idx], y[idx])` tuple."
  def __init__(self,x,y): self.x,self.y = x,y
  # The length is taken from `x` only.
  def __len__(self): return len(self.x)
  def __getitem__(self,idx): return self.x[idx],self.y[idx]

In [116]:
# Wrap the training and validation tensors so inputs and targets can be indexed together.
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)


In [118]:

# Same loop, but each mini-batch now comes from a single slice of train_ds.
for epoch in range(epochs):
    for i in range(0, n, bs):
        # Slicing the Dataset returns the (inputs, targets) pair; the last slice is clipped to n.
        xb,yb = train_ds[i:min(n,i+bs)]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        # Clear the gradients so the next backward() does not accumulate into them.
        opt.zero_grad()
    # Metrics of the last mini-batch of the epoch only.
    report(loss, preds, yb)

loss : 0.02  accuracy : 1.00
loss : 0.02  accuracy : 1.00
loss : 0.02  accuracy : 1.00


In [123]:
class DataLoader():
  "Iterates over `ds` in consecutive slices of `bs` items; the final slice may be shorter."
  def __init__(self,ds,bs):
    self.ds = ds
    self.bs = bs

  def __iter__(self):
    starts = range(0 , len(self.ds) , self.bs)
    yield from (self.ds[i:i+self.bs] for i in starts)

In [124]:
# Sequential mini-batch loaders over the two datasets, both with batch size `bs`.
train_dl = DataLoader(train_ds, bs)
valid_dl = DataLoader(valid_ds, bs)
     

In [125]:

def fit():
    """Train the notebook-level `model` for `epochs` passes over `train_dl`.

    Reads the globals `epochs`, `train_dl`, `model`, `loss_func` and `opt`.
    After each epoch, reports the loss and accuracy of that epoch's last batch.
    """
    for _ in range(epochs):
        for inputs, targets in train_dl:
            out = model(inputs)
            batch_loss = loss_func(out, targets)
            batch_loss.backward()
            opt.step()
            # Clear the gradients so the next backward() does not accumulate into them.
            opt.zero_grad()
        report(batch_loss, out, targets)

In [126]:
# Train with the DataLoader-driven loop defined above.
fit()

loss : 0.02  accuracy : 0.98
loss : 0.04  accuracy : 0.98
loss : 0.05  accuracy : 0.98


random sampling

In [127]:
import random
class Sampler():
  "Yields the indices 0..len(ds)-1, shuffled with Python's `random` module when `shuffle` is true."
  def __init__(self,ds,shuffle=True):
    self.n, self.shuffle = len(ds), shuffle

  def __iter__(self):
    # A fresh index list is built, and reshuffled if requested, on every iteration.
    order = [*range(self.n)]
    if self.shuffle: random.shuffle(order)
    return iter(order)

In [129]:
# Index sampler over the training set; `shuffle` keeps its default of True.
sampler =Sampler(train_ds)

In [133]:
# Peek at the first five indices produced by the sampler.
it = iter(sampler)
for i in range(5):
  print(next(it))

14384
8243
14175
15839
46259


In [139]:
import fastcore.all as fc
from itertools import islice

class BatchSampler():
  "Groups the indices produced by `sampler` into chunks of `bs` using fastcore's `chunked`."
  def __init__(self,sampler,bs,drop_last=False):
    # store_attr saves sampler, bs and drop_last as attributes of the same name.
    fc.store_attr()
  def __iter__(self):
    batches = fc.chunked(iter(self.sampler) , self.bs,drop_last = self.drop_last)
    for batch in batches: yield batch

In [140]:

# Group the sampler's indices into batches of 4 and show the first five batches.
batchs = BatchSampler(sampler, 4)
list(islice(batchs, 5))

[[4991, 7249, 42900, 36293],
 [13328, 45454, 15119, 4105],
 [18975, 35679, 14940, 44077],
 [25958, 28677, 9631, 28058],
 [12017, 42339, 28122, 112]]

In [142]:
def collate(b):
  "Turn an iterable of `(x, y)` pairs into a pair of stacked tensors `(xs, ys)`."
  # zip(*b) consumes `b` exactly once, so a generator is fine as input.
  inputs, targets = zip(*b)
  stacked_x = torch.stack(inputs)
  stacked_y = torch.stack(targets)
  return stacked_x, stacked_y

In [143]:
class DataLoader():
  "For every batch of indices from `batchs`, looks the items up in `ds` and merges them with `collate_fn`."
  def __init__(self , ds,batchs,collate_fn = collate):
    # store_attr saves ds, batchs and collate_fn as attributes of the same name.
    fc.store_attr()

  def __iter__(self):
    for b in self.batchs:
      # collate_fn receives a generator over the items of this batch.
      yield self.collate_fn(self.ds[i] for i in b)

In [144]:

# Batches of `bs` indices: shuffled for training, in order for validation.
train_samp = BatchSampler(Sampler(train_ds, shuffle=True ), bs)
valid_samp = BatchSampler(Sampler(valid_ds, shuffle=False), bs)

In [145]:

# Loaders driven by the batch samplers above; collate_fn keeps its default, `collate`.
train_dl = DataLoader(train_ds, batchs=train_samp)
valid_dl = DataLoader(valid_ds, batchs=valid_samp)

In [148]:
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, BatchSampler


In [149]:
# From here on BatchSampler is the torch.utils.data class imported just above, shadowing the notebook's own.
train_samp = BatchSampler(RandomSampler(train_ds),     bs, drop_last=False)
valid_samp = BatchSampler(SequentialSampler(valid_ds), bs, drop_last=False)

In [None]:


# torch.utils.data.DataLoader fed with the batch samplers above and the notebook's `collate` function.
train_dl = DataLoader(train_ds, batch_sampler=train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batch_sampler=valid_samp, collate_fn=collate)


In [None]:
def get_dls(train_ds, valid_ds, bs, **kwargs):
    """Build a `(train_dl, valid_dl)` pair of DataLoaders.

    The training loader shuffles and uses batch size `bs`. The validation
    loader uses batch size `bs*2` and no shuffle argument. Extra keyword
    arguments are forwarded to both DataLoader constructors.
    """
    train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True, **kwargs)
    valid_dl = DataLoader(valid_ds, batch_size=bs*2, **kwargs)
    return train_dl, valid_dl