# 1. Introduction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math

import torch
import torch.optim
import torch.functional as F

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d

In [52]:
mb_size = 100 # mini-batch size of 100


trans = transforms.Compose([transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5),
                                                 (0.5, 0.5, 0.5))])


dataset = dset.MNIST("./", download = True,
                     train = True,
                     transform = trans)


dataloader = torch.utils.data.DataLoader(dataset, batch_size=mb_size,
                                         shuffle=True, num_workers=1,
                                         pin_memory=True)



def init_weights(shape):
    w = torch.randn(size=shape)*0.01
    w.requires_grad = True
    return w

def rectify(X):
    return torch.max(torch.zeros_like(X), X)


# you can also use torch.nn.functional.softmax on future sheets
def softmax(X):
    c = torch.max(X, dim=1)[0].reshape(mb_size, 1)
    # this avoids a blow up of the exponentials
    # but calculates the same formula
    stabelized = X-c
    exp = torch.exp(stabelized)
    return exp/torch.sum(exp, dim=1).reshape(mb_size, 1)


# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.9, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(-group['lr'], grad, avg)


def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    #X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    #h_ = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    #h2_ = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax.transpose(0,1)


w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))

optimizer = RMSprop([w_h, w_h2, w_o])




# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    #print(np.shape(noise_py_x), np.shape(y))
    noise_py_x = noise_py_x.transpose(0,1)
    #print(np.shape(noise_py_x), np.shape(y))
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.301126480102539
Loss: 2.4601705074310303
Loss: 2.3724472522735596
Loss: 2.301452159881592
Loss: 2.2571959495544434
Loss: 2.173217535018921
Loss: 2.109001874923706
Loss: 1.9273102283477783
Loss: 1.8774402141571045
Loss: 1.8462146520614624
Loss: 1.8301942348480225
Loss: 1.6840736865997314
Loss: 1.693688988685608
Loss: 1.7736531496047974
Loss: 1.5892038345336914
Loss: 1.3629652261734009
Loss: 1.4609931707382202
Loss: 1.217271089553833
Loss: 1.3732244968414307
Loss: 1.1268824338912964
Loss: 1.1890673637390137
Loss: 1.2752197980880737
Loss: 0.9718953967094421
Loss: 0.9312899112701416
Loss: 0.9585617184638977
Loss: 0.7769391536712646
Loss: 1.0969319343566895
Loss: 0.852713406085968
Loss: 0.9407721161842346
Loss: 0.9949538707733154
Loss: 0.9168722033500671
Loss: 1.03035569190979
Loss: 0.7389092445373535
Loss: 0.9261389970779419
Loss: 0.8313544988632202
Loss: 0.9051421284675598
Loss: 0.6579617261886597
Loss: 0.9165657162666321
Loss: 0.7095609903335571
Loss: 0.9638584852218628
Loss: 0.6

Loss: 0.2005351483821869
Loss: 0.32315367460250854
Loss: 0.32866963744163513
Loss: 0.3176599442958832
Loss: 0.3605559468269348
Loss: 0.30380088090896606
Loss: 0.3019169867038727
Loss: 0.3096800744533539
Loss: 0.46932849287986755
Loss: 0.3770848512649536
Loss: 0.3909054696559906
Loss: 0.33027252554893494
Loss: 0.2613484859466553
Loss: 0.4297531545162201
Loss: 0.33085083961486816
Loss: 0.3430494964122772
Loss: 0.38354820013046265
Loss: 0.4369887411594391
Loss: 0.38750502467155457
Loss: 0.622992753982544
Loss: 0.3745587468147278
Loss: 0.4353584945201874
Loss: 0.29142215847969055
Loss: 0.31639865040779114
Loss: 0.4172540009021759
Loss: 0.4980846047401428
Loss: 0.18850763142108917
Loss: 0.52095627784729
Loss: 0.3495292365550995
Loss: 0.5436902642250061
Loss: 0.4675484597682953
Loss: 0.15053628385066986
Loss: 0.2518494725227356
Loss: 0.3862777352333069
Loss: 0.25665906071662903
Loss: 0.467543363571167
Loss: 0.37192878127098083
Loss: 0.5429825782775879
Loss: 0.5663868188858032
Loss: 0.3884362

# 2. Dropout

In [53]:
mb_size = 100 # mini-batch size of 100


trans = transforms.Compose([transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5),
                                                 (0.5, 0.5, 0.5))])


dataset = dset.MNIST("./", download = True,
                     train = True,
                     transform = trans)


dataloader = torch.utils.data.DataLoader(dataset, batch_size=mb_size,
                                         shuffle=True, num_workers=1,
                                         pin_memory=True)



def init_weights(shape):
    w = torch.randn(size=shape)*0.01
    w.requires_grad = True
    return w

def rectify(X):
    return torch.max(torch.zeros_like(X), X)


# you can also use torch.nn.functional.softmax on future sheets
def softmax(X):
    c = torch.max(X, dim=1)[0].reshape(mb_size, 1)
    # this avoids a blow up of the exponentials
    # but calculates the same formula
    stabelized = X-c
    exp = torch.exp(stabelized)
    return exp/torch.sum(exp, dim=1).reshape(mb_size, 1)

#(b)
def dropout1(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.tensor(np.random.binomial(1, p_drop, X.size())).float()
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()

def dropout(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.bernoulli(torch.full(X.shape, p_drop))
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()
#end (b)
    

# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.9, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(-group['lr'], grad, avg)


def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    h_ = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    h2_ = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax.transpose(0,1)


w_h = init_weights((784, 625))
w_h2 = init_weights((625, 625))
w_o = init_weights((625, 10))

optimizer = RMSprop([w_h, w_h2, w_o])




# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    #print(np.shape(noise_py_x), np.shape(y))
    noise_py_x = noise_py_x.transpose(0,1)
    #print(np.shape(noise_py_x), np.shape(y))
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.303811550140381
Loss: 2.4472110271453857
Loss: 2.292245864868164
Loss: 2.394937038421631
Loss: 2.237196922302246
Loss: 2.2075231075286865
Loss: 2.1441283226013184
Loss: 2.008239507675171
Loss: 1.9664784669876099
Loss: 1.8960392475128174
Loss: 1.7017848491668701
Loss: 1.8484954833984375
Loss: 1.740090012550354
Loss: 1.3819324970245361
Loss: 1.4581434726715088
Loss: 1.2676910161972046
Loss: 1.4671528339385986
Loss: 1.0352412462234497
Loss: 1.3838337659835815
Loss: 0.9910264611244202
Loss: 1.2821317911148071
Loss: 0.933592677116394
Loss: 0.8181854486465454
Loss: 1.0227619409561157
Loss: 1.1390184164047241
Loss: 0.8551374077796936
Loss: 0.858630359172821
Loss: 0.8524398803710938
Loss: 0.8306090831756592
Loss: 0.764746904373169
Loss: 0.7497779726982117
Loss: 1.2407103776931763
Loss: 0.7896282076835632
Loss: 1.0625354051589966
Loss: 0.7084797620773315
Loss: 0.9143081903457642
Loss: 0.9913882613182068
Loss: 1.1526762247085571
Loss: 0.7158167958259583
Loss: 0.9760000109672546
Loss: 0.5

Loss: 0.32520249485969543
Loss: 0.3866533041000366
Loss: 0.30278462171554565
Loss: 0.4906283915042877
Loss: 0.3926841616630554
Loss: 0.4692765474319458
Loss: 0.3087824285030365
Loss: 0.414774090051651
Loss: 0.4170227348804474
Loss: 0.3196147382259369
Loss: 0.2323639690876007
Loss: 0.3461788594722748
Loss: 0.3920917809009552
Loss: 0.4192280173301697
Loss: 0.30518895387649536
Loss: 0.4699631631374359
Loss: 0.5305116176605225
Loss: 0.2777816653251648
Loss: 0.3318922519683838
Loss: 0.31981778144836426
Loss: 0.5919685959815979
Loss: 0.4439484775066376
Loss: 0.36407792568206787
Loss: 0.5106537342071533
Loss: 0.7767918109893799
Loss: 0.6996302008628845
Loss: 0.28753283619880676
Loss: 0.3969210088253021
Loss: 0.3770645260810852
Loss: 0.4471604824066162
Loss: 0.36707016825675964
Loss: 0.6167523860931396
Loss: 0.31431716680526733
Loss: 0.3210548460483551
Loss: 0.47650691866874695
Loss: 0.30715706944465637
Loss: 0.2842501401901245
Loss: 0.6917865872383118
Loss: 0.43185776472091675
Loss: 0.4195909

(b) Explanation here!
probably because random dropouts draw the NN away from overfitting/minima and allow for a well trained network to fine-adjust to a certain degree

# 3 Parametric Relu

In [None]:
def PRelu(X,a):