# 1 Introduction

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math

import torch
import torch.optim
import torch.functional as F

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d

In [2]:
mb_size = 100 # mini-batch size of 100

In [3]:
trans = transforms.Compose([transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5),
                                                 (0.5, 0.5, 0.5))])

In [4]:
dataset = dset.MNIST("./", download = True,
                     train = True,
                     transform = trans)

In [5]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=mb_size,
                                         shuffle=True, num_workers=1,
                                         pin_memory=True)


In [6]:
def init_weights(shape):
    # xavier initialization (a good initialization is important!)
    # http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization
    fan_in = shape[0]
    fan_out = shape[1]
    variance = 2.0/(fan_in + fan_out)
    w = torch.randn(size=shape)*np.sqrt(variance)
    w.requires_grad = True
    return w

In [7]:
def rectify(X):
    return torch.max(torch.zeros_like(X), X)

In [8]:
# you can also use torch.nn.functional.softmax on future sheets
def softmax(X):
    c = torch.max(X, dim=1)[0].reshape(mb_size, 1)
    # this avoids a blow up of the exponentials
    # but calculates the same formula
    stabelized = X-c
    exp = torch.exp(stabelized)
    return exp/torch.sum(exp, dim=1).reshape(mb_size, 1)

In [9]:
# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.9, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(-group['lr'], grad, avg)

In [10]:
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    #X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    #h = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    #h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [11]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))

optimizer = RMSprop([w_h, w_h2, w_o])

In [12]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.3650574684143066
Loss: 2.4331960678100586
Loss: 2.265251398086548
Loss: 2.1515004634857178
Loss: 2.122255563735962
Loss: 2.0780086517333984
Loss: 1.9510952234268188
Loss: 1.7534270286560059
Loss: 1.7162812948226929
Loss: 1.6557480096817017
Loss: 1.5992196798324585
Loss: 1.4189345836639404
Loss: 1.5806589126586914
Loss: 1.5623953342437744
Loss: 1.2064529657363892
Loss: 1.1244980096817017
Loss: 1.1663738489151
Loss: 1.2691872119903564
Loss: 1.3251729011535645
Loss: 0.9300680756568909
Loss: 0.9207238554954529
Loss: 0.9960853457450867
Loss: 0.8971188068389893
Loss: 0.9928513169288635
Loss: 0.9217621088027954
Loss: 1.1288933753967285
Loss: 0.9542943835258484
Loss: 0.7582323551177979
Loss: 1.0451061725616455
Loss: 1.059401512145996
Loss: 0.8125346302986145
Loss: 0.8085698485374451
Loss: 0.8077687621116638
Loss: 0.7307479381561279
Loss: 0.9974827766418457
Loss: 0.8421211242675781
Loss: 1.032852053642273
Loss: 0.8811557292938232
Loss: 1.1567177772521973
Loss: 1.06706702709198
Loss: 0.8

Loss: 0.5127851366996765
Loss: 0.5655394196510315
Loss: 0.34945568442344666
Loss: 0.38680610060691833
Loss: 0.3107341229915619
Loss: 0.7074310183525085
Loss: 0.3330828547477722
Loss: 0.676852822303772
Loss: 0.49547135829925537
Loss: 0.6491640210151672
Loss: 0.4973950982093811
Loss: 0.4866896867752075
Loss: 0.5007063150405884
Loss: 0.3366341292858124
Loss: 0.8163063526153564
Loss: 0.664827287197113
Loss: 0.38241255283355713
Loss: 0.43316036462783813
Loss: 0.8723129034042358
Loss: 0.8584028482437134
Loss: 0.639012336730957
Loss: 0.7487651109695435
Loss: 0.7680003643035889
Loss: 0.779984712600708
Loss: 0.4549174904823303
Loss: 0.5873608589172363
Loss: 0.28681111335754395
Loss: 0.3197105824947357
Loss: 0.3368431031703949
Loss: 0.533292293548584
Loss: 0.5929557085037231
Loss: 0.4101201593875885
Loss: 0.4817691743373871
Loss: 0.25115570425987244
Loss: 0.45354145765304565
Loss: 0.5120304226875305
Loss: 0.3162699341773987
Loss: 0.42274218797683716
Loss: 0.5852655172348022
Loss: 0.5165359973907

# 2 Dropout

In [17]:
def dropout1(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.tensor(np.random.binomial(1, p_drop, X.size())).float()
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()

def dropout(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.bernoulli(torch.full(X.shape, p_drop))
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()

In [18]:
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    h = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [19]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))

optimizer = RMSprop([w_h, w_h2, w_o])

In [19]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.7761785984039307
Loss: 2.6229982376098633
Loss: 2.3505163192749023
Loss: 2.4562995433807373
Loss: 2.2963004112243652
Loss: 2.2877907752990723
Loss: 2.326465368270874
Loss: 2.2519524097442627
Loss: 2.257910966873169
Loss: 2.294045925140381
Loss: 2.233964204788208
Loss: 2.2303404808044434
Loss: 2.2559640407562256
Loss: 2.2133121490478516
Loss: 2.2026357650756836
Loss: 2.190990686416626
Loss: 2.220290422439575
Loss: 2.1194369792938232
Loss: 2.217381238937378
Loss: 2.2835404872894287
Loss: 2.217271566390991
Loss: 2.1934566497802734
Loss: 2.1742794513702393
Loss: 2.1087615489959717
Loss: 2.1302084922790527
Loss: 2.1062982082366943
Loss: 2.1814534664154053
Loss: 2.2340080738067627
Loss: 2.0547263622283936
Loss: 2.057990074157715
Loss: 2.0572774410247803
Loss: 1.9716852903366089
Loss: 2.0525991916656494
Loss: 2.092095136642456
Loss: 2.1406333446502686
Loss: 1.951910376548767
Loss: 1.848122000694275
Loss: 2.0032799243927
Loss: 2.099607229232788
Loss: 2.067258358001709
Loss: 1.932070136

Loss: 1.5343044996261597
Loss: 1.470085859298706
Loss: 1.7409405708312988
Loss: 1.3208531141281128
Loss: 1.6532695293426514
Loss: 1.701149344444275
Loss: 1.5775469541549683
Loss: 1.6016736030578613
Loss: 1.3746775388717651
Loss: 1.5244183540344238
Loss: 1.4841586351394653
Loss: 1.537546992301941
Loss: 1.5829201936721802
Loss: 1.5313972234725952
Loss: 1.5151599645614624
Loss: 1.5521986484527588
Loss: 1.5350052118301392
Loss: 1.6204383373260498
Loss: 1.681959867477417
Loss: 1.3975515365600586
Loss: 1.558091402053833
Loss: 1.5642389059066772
Loss: 1.472490668296814
Loss: 1.7206252813339233
Loss: 1.522516131401062
Loss: 1.4833563566207886
Loss: 1.6760822534561157
Loss: 1.4523215293884277
Loss: 1.5783722400665283
Loss: 1.555077075958252
Loss: 1.6597400903701782
Loss: 1.6604149341583252
Loss: 1.4987215995788574
Loss: 1.5114774703979492
Loss: 1.4944146871566772
Loss: 1.5135207176208496
Loss: 1.6269962787628174
Loss: 1.5217090845108032
Loss: 1.389207363128662
Loss: 1.621984601020813
Loss: 1.57

(b) Explanation here!
probably because random dropouts draw the NN away from overfitting/minima and allow for a well trained network to fine-adjust to a certain degree

# 3 Parametric Relu

In [21]:
def PRelu(X,a):
        return torch.where(X > 0, X, a*X)

In [22]:
def model(X, w_h, w_h2, w_o, a, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = PRelu(X @ w_h, a)
    h = dropout(h, p_drop_hidden)
    h2 = PRelu(h @ w_h2, a)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [23]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))
a = torch.tensor([-0.1], requires_grad = True)

optimizer = RMSprop([w_h, w_h2, w_o, a])

In [30]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, a, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    #print("Loss: {:3f}".format(cost))
    print('step: ', _)
    print('loss: %.4f' % cost)
    print('a: %.4f' % a)
    optimizer.step()

step:  0
loss: 1.9112
a: -0.0046
step:  1
loss: 1.7840
a: -0.0035
step:  2
loss: 2.0550
a: -0.0024
step:  3
loss: 1.8941
a: -0.0013
step:  4
loss: 1.9640
a: -0.0002
step:  5
loss: 7.8278
a: 0.0008
step:  6
loss: 2.1500
a: -0.0019
step:  7
loss: 1.8856
a: -0.0040
step:  8
loss: 1.9853
a: -0.0058
step:  9
loss: 2.0946
a: -0.0074
step:  10
loss: 2.3781
a: -0.0090
step:  11
loss: 2.0241
a: -0.0104
step:  12
loss: 4.2390
a: -0.0117
step:  13
loss: 2.7668
a: -0.0130
step:  14
loss: 2.7723
a: -0.0142
step:  15
loss: 3.4524
a: -0.0154
step:  16
loss: 3.0168
a: -0.0165
step:  17
loss: 3.9154
a: -0.0175
step:  18
loss: 3.5155
a: -0.0185
step:  19
loss: 4.0209
a: -0.0195
step:  20
loss: 4.7518
a: -0.0204
step:  21
loss: 4.4742
a: -0.0213
step:  22
loss: 5.2670
a: -0.0221
step:  23
loss: 4.9377
a: -0.0228
step:  24
loss: 5.6557
a: -0.0236
step:  25
loss: 5.5684
a: -0.0242
step:  26
loss: 6.0723
a: -0.0248
step:  27
loss: 6.4312
a: -0.0253
step:  28
loss: 6.2490
a: -0.0258
step:  29
loss: 6.5367
a:

step:  238
loss: 2.3429
a: -0.0133
step:  239
loss: 4.1186
a: -0.0142
step:  240
loss: 2.6592
a: -0.0152
step:  241
loss: 2.8555
a: -0.0161
step:  242
loss: 3.5493
a: -0.0170
step:  243
loss: 3.4798
a: -0.0179
step:  244
loss: 3.4678
a: -0.0187
step:  245
loss: 4.0456
a: -0.0195
step:  246
loss: 4.9414
a: -0.0203
step:  247
loss: 4.3551
a: -0.0210
step:  248
loss: 4.3241
a: -0.0216
step:  249
loss: 4.5995
a: -0.0222
step:  250
loss: 5.7498
a: -0.0228
step:  251
loss: 6.6830
a: -0.0233
step:  252
loss: 6.9842
a: -0.0237
step:  253
loss: 5.1153
a: -0.0240
step:  254
loss: 5.8529
a: -0.0243
step:  255
loss: 5.4233
a: -0.0244
step:  256
loss: 4.8622
a: -0.0244
step:  257
loss: 6.5796
a: -0.0243
step:  258
loss: 6.2915
a: -0.0241
step:  259
loss: 4.3274
a: -0.0237
step:  260
loss: 4.3350
a: -0.0232
step:  261
loss: 6.8774
a: -0.0226
step:  262
loss: 4.7455
a: -0.0218
step:  263
loss: 3.7258
a: -0.0209
step:  264
loss: 3.7092
a: -0.0198
step:  265
loss: 3.4305
a: -0.0187
step:  266
loss: 2.8

step:  481
loss: 2.6574
a: -0.0133
step:  482
loss: 2.2000
a: -0.0143
step:  483
loss: 2.5232
a: -0.0153
step:  484
loss: 2.2585
a: -0.0163
step:  485
loss: 3.0406
a: -0.0173
step:  486
loss: 2.5412
a: -0.0183
step:  487
loss: 3.5723
a: -0.0192
step:  488
loss: 2.7401
a: -0.0201
step:  489
loss: 3.6369
a: -0.0209
step:  490
loss: 3.3026
a: -0.0218
step:  491
loss: 3.3768
a: -0.0226
step:  492
loss: 3.8662
a: -0.0233
step:  493
loss: 4.7595
a: -0.0240
step:  494
loss: 4.8280
a: -0.0247
step:  495
loss: 4.4165
a: -0.0253
step:  496
loss: 5.3931
a: -0.0258
step:  497
loss: 5.4893
a: -0.0262
step:  498
loss: 5.2539
a: -0.0266
step:  499
loss: 5.3053
a: -0.0269
step:  500
loss: 6.1748
a: -0.0271
step:  501
loss: 5.0958
a: -0.0271
step:  502
loss: 5.1425
a: -0.0271
step:  503
loss: 5.2241
a: -0.0269
step:  504
loss: 3.9033
a: -0.0266
step:  505
loss: 5.4040
a: -0.0261
step:  506
loss: 3.7165
a: -0.0255
step:  507
loss: 4.5106
a: -0.0248
step:  508
loss: 4.5630
a: -0.0239
step:  509
loss: 4.1

As one can see, the PRelu is adaptedin each step.