# 1 Introduction

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math

import torch
import torch.optim
import torch.functional as F

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d

In [2]:
mb_size = 100 # mini-batch size of 100

In [3]:
trans = transforms.Compose([transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5),
                                                 (0.5, 0.5, 0.5))])

In [4]:
dataset = dset.MNIST("./", download = True,
                     train = True,
                     transform = trans)

In [5]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=mb_size,
                                         shuffle=True, num_workers=1,
                                         pin_memory=True)


In [6]:
def init_weights(shape):
    # xavier initialization (a good initialization is important!)
    # http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization
    fan_in = shape[0]
    fan_out = shape[1]
    variance = 2.0/(fan_in + fan_out)
    w = torch.randn(size=shape)*np.sqrt(variance)
    w.requires_grad = True
    return w

In [7]:
def rectify(X):
    return torch.max(torch.zeros_like(X), X)

In [8]:
# you can also use torch.nn.functional.softmax on future sheets
def softmax(X):
    c = torch.max(X, dim=1)[0].reshape(mb_size, 1)
    # this avoids a blow up of the exponentials
    # but calculates the same formula
    stabelized = X-c
    exp = torch.exp(stabelized)
    return exp/torch.sum(exp, dim=1).reshape(mb_size, 1)

In [9]:
# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.9, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(-group['lr'], grad, avg)

In [13]:
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    #X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    #h = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    #h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [14]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))

optimizer = RMSprop([w_h, w_h2, w_o])

In [15]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.8268704414367676
Loss: 2.7285714149475098
Loss: 2.289259910583496
Loss: 2.271031141281128
Loss: 2.20009708404541
Loss: 2.045694351196289
Loss: 1.9569151401519775
Loss: 1.9066400527954102
Loss: 1.8437570333480835
Loss: 1.8244924545288086
Loss: 1.5523459911346436
Loss: 1.493617296218872
Loss: 1.4734643697738647
Loss: 1.4040019512176514
Loss: 1.2545500993728638
Loss: 1.2252566814422607
Loss: 1.1988110542297363
Loss: 1.1603686809539795
Loss: 1.0522253513336182
Loss: 1.0674927234649658
Loss: 1.1157876253128052
Loss: 0.9356104135513306
Loss: 1.149595022201538
Loss: 0.9950889348983765
Loss: 0.8677853941917419
Loss: 1.0581778287887573
Loss: 0.886738121509552
Loss: 0.7940847277641296
Loss: 0.8008765578269958
Loss: 0.8940781950950623
Loss: 0.563798725605011
Loss: 0.6920072436332703
Loss: 0.8638289570808411
Loss: 0.4535703659057617
Loss: 0.9806511402130127
Loss: 0.7706287503242493
Loss: 0.855924129486084
Loss: 0.8537286520004272
Loss: 0.9185181260108948
Loss: 0.5246160626411438
Loss: 0.69

Loss: 0.563618540763855
Loss: 0.5111581087112427
Loss: 0.36514556407928467
Loss: 0.45664486289024353
Loss: 0.3715624213218689
Loss: 0.5820598006248474
Loss: 0.3891925811767578
Loss: 0.7250534296035767
Loss: 0.27294039726257324
Loss: 0.3358617424964905
Loss: 0.4220786988735199
Loss: 0.6052753329277039
Loss: 0.8506615161895752
Loss: 0.6801207065582275
Loss: 0.5049019455909729
Loss: 0.31883618235588074
Loss: 0.37427768111228943
Loss: 0.3218585252761841
Loss: 0.5204296708106995
Loss: 0.22815518081188202
Loss: 0.5129340887069702
Loss: 0.39079907536506653
Loss: 0.2627788782119751
Loss: 0.2520519196987152
Loss: 0.23376873135566711
Loss: 0.4659309685230255
Loss: 0.5273849368095398
Loss: 0.48139050602912903
Loss: 0.31470754742622375
Loss: 0.3491021990776062
Loss: 0.5018615126609802
Loss: 0.24332115054130554
Loss: 0.49822482466697693
Loss: 0.3501926064491272
Loss: 0.4038342237472534
Loss: 0.49790531396865845
Loss: 0.6591052412986755
Loss: 0.5976378917694092
Loss: 0.5134736895561218
Loss: 0.29117

# 2 Dropout

In [16]:
def dropout1(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.tensor(np.random.binomial(1, p_drop, X.size())).float()
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()

def dropout(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.bernoulli(torch.full(X.shape, p_drop))
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()

In [17]:
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    h = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [18]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))

optimizer = RMSprop([w_h, w_h2, w_o])

In [19]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.7761785984039307
Loss: 2.6229982376098633
Loss: 2.3505163192749023
Loss: 2.4562995433807373
Loss: 2.2963004112243652
Loss: 2.2877907752990723
Loss: 2.326465368270874
Loss: 2.2519524097442627
Loss: 2.257910966873169
Loss: 2.294045925140381
Loss: 2.233964204788208
Loss: 2.2303404808044434
Loss: 2.2559640407562256
Loss: 2.2133121490478516
Loss: 2.2026357650756836
Loss: 2.190990686416626
Loss: 2.220290422439575
Loss: 2.1194369792938232
Loss: 2.217381238937378
Loss: 2.2835404872894287
Loss: 2.217271566390991
Loss: 2.1934566497802734
Loss: 2.1742794513702393
Loss: 2.1087615489959717
Loss: 2.1302084922790527
Loss: 2.1062982082366943
Loss: 2.1814534664154053
Loss: 2.2340080738067627
Loss: 2.0547263622283936
Loss: 2.057990074157715
Loss: 2.0572774410247803
Loss: 1.9716852903366089
Loss: 2.0525991916656494
Loss: 2.092095136642456
Loss: 2.1406333446502686
Loss: 1.951910376548767
Loss: 1.848122000694275
Loss: 2.0032799243927
Loss: 2.099607229232788
Loss: 2.067258358001709
Loss: 1.932070136

Loss: 1.5343044996261597
Loss: 1.470085859298706
Loss: 1.7409405708312988
Loss: 1.3208531141281128
Loss: 1.6532695293426514
Loss: 1.701149344444275
Loss: 1.5775469541549683
Loss: 1.6016736030578613
Loss: 1.3746775388717651
Loss: 1.5244183540344238
Loss: 1.4841586351394653
Loss: 1.537546992301941
Loss: 1.5829201936721802
Loss: 1.5313972234725952
Loss: 1.5151599645614624
Loss: 1.5521986484527588
Loss: 1.5350052118301392
Loss: 1.6204383373260498
Loss: 1.681959867477417
Loss: 1.3975515365600586
Loss: 1.558091402053833
Loss: 1.5642389059066772
Loss: 1.472490668296814
Loss: 1.7206252813339233
Loss: 1.522516131401062
Loss: 1.4833563566207886
Loss: 1.6760822534561157
Loss: 1.4523215293884277
Loss: 1.5783722400665283
Loss: 1.555077075958252
Loss: 1.6597400903701782
Loss: 1.6604149341583252
Loss: 1.4987215995788574
Loss: 1.5114774703979492
Loss: 1.4944146871566772
Loss: 1.5135207176208496
Loss: 1.6269962787628174
Loss: 1.5217090845108032
Loss: 1.389207363128662
Loss: 1.621984601020813
Loss: 1.57

(b) Explanation here!
probably because random dropouts draw the NN away from overfitting/minima and allow for a well trained network to fine-adjust to a certain degree

# 3 Parametric Relu

In [20]:
def PRelu(X,a):
        return torch.where(X > 0, X, a*X)

In [21]:
def model(X, w_h, w_h2, w_o, a, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = PRelu(X @ w_h, a)
    h = dropout(h, p_drop_hidden)
    h2 = PRelu(h @ w_h2, a)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [22]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))
a = torch.tensor([-0.1], requires_grad = True)

optimizer = RMSprop([w_h, w_h2, w_o, a])

In [23]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, a, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    print(a)
    optimizer.step()

Loss: 2.5307977199554443
tensor([-0.1000])
Loss: 2.622321367263794
tensor([-0.1032])
Loss: 2.4067304134368896
tensor([-0.1000])
Loss: 2.3277883529663086
tensor([-0.1010])
Loss: 2.3197245597839355
tensor(1.00000e-02 *
       [-9.8731])
Loss: 2.3492014408111572
tensor(1.00000e-02 *
       [-9.5878])
Loss: 2.348719358444214
tensor(1.00000e-02 *
       [-9.3105])
Loss: 2.166813373565674
tensor(1.00000e-02 *
       [-9.0342])
Loss: 2.177947521209717
tensor(1.00000e-02 *
       [-8.7885])
Loss: 2.269373893737793
tensor(1.00000e-02 *
       [-8.5675])
Loss: 2.2585909366607666
tensor(1.00000e-02 *
       [-8.3655])
Loss: 2.2392072677612305
tensor(1.00000e-02 *
       [-8.1608])
Loss: 2.0897912979125977
tensor(1.00000e-02 *
       [-7.9748])
Loss: 2.103684425354004
tensor(1.00000e-02 *
       [-7.8039])
Loss: 2.1360888481140137
tensor(1.00000e-02 *
       [-7.6418])
Loss: 2.218292236328125
tensor(1.00000e-02 *
       [-7.4742])
Loss: 2.0900697708129883
tensor(1.00000e-02 *
       [-7.3223])
Los

Loss: 1.611220359802246
tensor(1.00000e-03 *
       [-1.3215])
Loss: 1.068949580192566
tensor(1.00000e-03 *
       [-3.3813])
Loss: 1.3698468208312988
tensor(1.00000e-03 *
       [-5.0570])
Loss: 1.163787841796875
tensor(1.00000e-03 *
       [-6.6090])
Loss: 1.500837802886963
tensor(1.00000e-03 *
       [-7.0828])
Loss: 1.6064335107803345
tensor(1.00000e-03 *
       [-5.8043])
Loss: 1.3276734352111816
tensor(1.00000e-03 *
       [-3.7705])
Loss: 1.047562599182129
tensor(1.00000e-03 *
       [-2.1946])
Loss: 1.214798092842102
tensor(1.00000e-04 *
       [-7.3250])
Loss: 1.2055529356002808
tensor(1.00000e-04 *
       [ 4.2009])
Loss: 1.3080590963363647
tensor(1.00000e-03 *
       [ 1.6587])
Loss: 1.1697635650634766
tensor(1.00000e-03 *
       [ 2.6362])
Loss: 1.2595913410186768
tensor(1.00000e-03 *
       [ 2.3768])
Loss: 1.1572777032852173
tensor(1.00000e-04 *
       [ 7.0096])
Loss: 1.2675567865371704
tensor(1.00000e-03 *
       [-1.2080])
Loss: 1.2254329919815063
tensor(1.00000e-03 *


Loss: 1.2005417346954346
tensor(1.00000e-03 *
       [-6.3113])
Loss: 1.2915048599243164
tensor(1.00000e-03 *
       [-6.4505])
Loss: 1.1674519777297974
tensor(1.00000e-03 *
       [-6.1824])
Loss: 1.0041338205337524
tensor(1.00000e-03 *
       [-5.5025])
Loss: 1.2410303354263306
tensor(1.00000e-03 *
       [-4.5173])
Loss: 1.1797305345535278
tensor(1.00000e-03 *
       [-3.4474])
Loss: 1.3148618936538696
tensor(1.00000e-03 *
       [-2.3589])
Loss: 1.0487557649612427
tensor(1.00000e-03 *
       [-1.3066])
Loss: 1.1620583534240723
tensor(1.00000e-04 *
       [-4.9474])
Loss: 1.1833609342575073
tensor(1.00000e-04 *
       [ 1.1992])
Loss: 1.3943617343902588
tensor(1.00000e-03 *
       [ 1.4471])
Loss: 0.9734368324279785
tensor(1.00000e-03 *
       [ 1.9497])
Loss: 1.1953359842300415
tensor(1.00000e-04 *
       [ 9.7303])
Loss: 1.2459336519241333
tensor(1.00000e-05 *
       [ 8.0679])
Loss: 1.14519202709198
tensor(1.00000e-05 *
       [-8.8484])
Loss: 1.3013402223587036
tensor(1.00000e-0

Loss: 1.2758526802062988
tensor(1.00000e-04 *
       [-6.6351])
Loss: 1.3784838914871216
tensor(1.00000e-04 *
       [-2.9785])
Loss: 1.4601374864578247
tensor(1.00000e-04 *
       [-8.6726])
Loss: 1.3185627460479736
tensor(1.00000e-03 *
       [-2.0712])
Loss: 1.1767839193344116
tensor(1.00000e-03 *
       [-3.2713])
Loss: 1.2558155059814453
tensor(1.00000e-03 *
       [-4.3839])
Loss: 1.50919508934021
tensor(1.00000e-03 *
       [-5.2278])
Loss: 1.5236337184906006
tensor(1.00000e-03 *
       [-6.0987])
Loss: 1.2066893577575684
tensor(1.00000e-03 *
       [-6.5352])
Loss: 1.2618050575256348
tensor(1.00000e-03 *
       [-6.8705])
Loss: 1.1846778392791748
tensor(1.00000e-03 *
       [-6.1731])
Loss: 1.3614072799682617
tensor(1.00000e-03 *
       [-4.7145])
Loss: 1.188896656036377
tensor(1.00000e-03 *
       [-3.1742])
Loss: 1.2866969108581543
tensor(1.00000e-03 *
       [-1.7476])
Loss: 1.3850998878479004
tensor(1.00000e-04 *
       [-4.2913])
Loss: 1.2575024366378784
tensor(1.00000e-04

Loss: 1.3048418760299683
tensor(1.00000e-03 *
       [-7.0060])
Loss: 1.1744650602340698
tensor(1.00000e-03 *
       [-7.7949])
Loss: 1.5569287538528442
tensor(1.00000e-03 *
       [-8.5059])
Loss: 1.0844371318817139
tensor(1.00000e-03 *
       [-8.9414])
Loss: 1.3887611627578735
tensor(1.00000e-03 *
       [-9.1041])
Loss: 1.3032950162887573
tensor(1.00000e-03 *
       [-9.1204])
Loss: 1.5156968832015991
tensor(1.00000e-03 *
       [-8.7626])
Loss: 1.2857452630996704
tensor(1.00000e-03 *
       [-8.1870])
Loss: 1.4450812339782715
tensor(1.00000e-03 *
       [-7.3384])
Loss: 1.0133841037750244
tensor(1.00000e-03 *
       [-6.3220])
Loss: 1.5994927883148193
tensor(1.00000e-03 *
       [-5.2030])
Loss: 1.1177027225494385
tensor(1.00000e-03 *
       [-4.0868])
Loss: 1.1413968801498413
tensor(1.00000e-03 *
       [-2.9405])
Loss: 1.2888699769973755
tensor(1.00000e-03 *
       [-1.9067])
Loss: 1.020195722579956
tensor(1.00000e-04 *
       [-9.8387])
Loss: 1.0561456680297852
tensor(1.00000e-

As one can see, the PRelu is adaptedin each step.