# 1 Introduction

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math

import torch
import torch.optim
import torch.functional as F

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d

In [2]:
mb_size = 100 # mini-batch size of 100

In [3]:
trans = transforms.Compose([transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5),
                                                 (0.5, 0.5, 0.5))])

In [4]:
dataset = dset.MNIST("./", download = True,
                     train = True,
                     transform = trans)

In [5]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=mb_size,
                                         shuffle=True, num_workers=1,
                                         pin_memory=True)


In [6]:
def init_weights(shape):
    # xavier initialization (a good initialization is important!)
    # http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization
    fan_in = shape[0]
    fan_out = shape[1]
    variance = 2.0/(fan_in + fan_out)
    w = torch.randn(size=shape)*np.sqrt(variance)
    w.requires_grad = True
    return w

In [7]:
def rectify(X):
    return torch.max(torch.zeros_like(X), X)

In [8]:
# you can also use torch.nn.functional.softmax on future sheets
def softmax(X):
    c = torch.max(X, dim=1)[0].reshape(mb_size, 1)
    # this avoids a blow up of the exponentials
    # but calculates the same formula
    stabelized = X-c
    exp = torch.exp(stabelized)
    return exp/torch.sum(exp, dim=1).reshape(mb_size, 1)

In [9]:
# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.9, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(-group['lr'], grad, avg)

In [10]:
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    #X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    #h = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    #h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [11]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))

optimizer = RMSprop([w_h, w_h2, w_o])

In [12]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.3650574684143066
Loss: 2.4331960678100586
Loss: 2.265251398086548
Loss: 2.1515004634857178
Loss: 2.122255563735962
Loss: 2.0780086517333984
Loss: 1.9510952234268188
Loss: 1.7534270286560059
Loss: 1.7162812948226929
Loss: 1.6557480096817017
Loss: 1.5992196798324585
Loss: 1.4189345836639404
Loss: 1.5806589126586914
Loss: 1.5623953342437744
Loss: 1.2064529657363892
Loss: 1.1244980096817017
Loss: 1.1663738489151
Loss: 1.2691872119903564
Loss: 1.3251729011535645
Loss: 0.9300680756568909
Loss: 0.9207238554954529
Loss: 0.9960853457450867
Loss: 0.8971188068389893
Loss: 0.9928513169288635
Loss: 0.9217621088027954
Loss: 1.1288933753967285
Loss: 0.9542943835258484
Loss: 0.7582323551177979
Loss: 1.0451061725616455
Loss: 1.059401512145996
Loss: 0.8125346302986145
Loss: 0.8085698485374451
Loss: 0.8077687621116638
Loss: 0.7307479381561279
Loss: 0.9974827766418457
Loss: 0.8421211242675781
Loss: 1.032852053642273
Loss: 0.8811557292938232
Loss: 1.1567177772521973
Loss: 1.06706702709198
Loss: 0.8

Loss: 0.5127851366996765
Loss: 0.5655394196510315
Loss: 0.34945568442344666
Loss: 0.38680610060691833
Loss: 0.3107341229915619
Loss: 0.7074310183525085
Loss: 0.3330828547477722
Loss: 0.676852822303772
Loss: 0.49547135829925537
Loss: 0.6491640210151672
Loss: 0.4973950982093811
Loss: 0.4866896867752075
Loss: 0.5007063150405884
Loss: 0.3366341292858124
Loss: 0.8163063526153564
Loss: 0.664827287197113
Loss: 0.38241255283355713
Loss: 0.43316036462783813
Loss: 0.8723129034042358
Loss: 0.8584028482437134
Loss: 0.639012336730957
Loss: 0.7487651109695435
Loss: 0.7680003643035889
Loss: 0.779984712600708
Loss: 0.4549174904823303
Loss: 0.5873608589172363
Loss: 0.28681111335754395
Loss: 0.3197105824947357
Loss: 0.3368431031703949
Loss: 0.533292293548584
Loss: 0.5929557085037231
Loss: 0.4101201593875885
Loss: 0.4817691743373871
Loss: 0.25115570425987244
Loss: 0.45354145765304565
Loss: 0.5120304226875305
Loss: 0.3162699341773987
Loss: 0.42274218797683716
Loss: 0.5852655172348022
Loss: 0.5165359973907

# 2 Dropout

In [17]:
def dropout1(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.tensor(np.random.binomial(1, p_drop, X.size())).float()
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()

def dropout(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.bernoulli(torch.full(X.shape, p_drop))
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()

In [18]:
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    h = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [19]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))

optimizer = RMSprop([w_h, w_h2, w_o])

In [19]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.7761785984039307
Loss: 2.6229982376098633
Loss: 2.3505163192749023
Loss: 2.4562995433807373
Loss: 2.2963004112243652
Loss: 2.2877907752990723
Loss: 2.326465368270874
Loss: 2.2519524097442627
Loss: 2.257910966873169
Loss: 2.294045925140381
Loss: 2.233964204788208
Loss: 2.2303404808044434
Loss: 2.2559640407562256
Loss: 2.2133121490478516
Loss: 2.2026357650756836
Loss: 2.190990686416626
Loss: 2.220290422439575
Loss: 2.1194369792938232
Loss: 2.217381238937378
Loss: 2.2835404872894287
Loss: 2.217271566390991
Loss: 2.1934566497802734
Loss: 2.1742794513702393
Loss: 2.1087615489959717
Loss: 2.1302084922790527
Loss: 2.1062982082366943
Loss: 2.1814534664154053
Loss: 2.2340080738067627
Loss: 2.0547263622283936
Loss: 2.057990074157715
Loss: 2.0572774410247803
Loss: 1.9716852903366089
Loss: 2.0525991916656494
Loss: 2.092095136642456
Loss: 2.1406333446502686
Loss: 1.951910376548767
Loss: 1.848122000694275
Loss: 2.0032799243927
Loss: 2.099607229232788
Loss: 2.067258358001709
Loss: 1.932070136

Loss: 1.5343044996261597
Loss: 1.470085859298706
Loss: 1.7409405708312988
Loss: 1.3208531141281128
Loss: 1.6532695293426514
Loss: 1.701149344444275
Loss: 1.5775469541549683
Loss: 1.6016736030578613
Loss: 1.3746775388717651
Loss: 1.5244183540344238
Loss: 1.4841586351394653
Loss: 1.537546992301941
Loss: 1.5829201936721802
Loss: 1.5313972234725952
Loss: 1.5151599645614624
Loss: 1.5521986484527588
Loss: 1.5350052118301392
Loss: 1.6204383373260498
Loss: 1.681959867477417
Loss: 1.3975515365600586
Loss: 1.558091402053833
Loss: 1.5642389059066772
Loss: 1.472490668296814
Loss: 1.7206252813339233
Loss: 1.522516131401062
Loss: 1.4833563566207886
Loss: 1.6760822534561157
Loss: 1.4523215293884277
Loss: 1.5783722400665283
Loss: 1.555077075958252
Loss: 1.6597400903701782
Loss: 1.6604149341583252
Loss: 1.4987215995788574
Loss: 1.5114774703979492
Loss: 1.4944146871566772
Loss: 1.5135207176208496
Loss: 1.6269962787628174
Loss: 1.5217090845108032
Loss: 1.389207363128662
Loss: 1.621984601020813
Loss: 1.57

(b) Explanation here!
probably because random dropouts draw the NN away from overfitting/minima and allow for a well trained network to fine-adjust to a certain degree

# 3 Parametric Relu

In [21]:
def PRelu(X,a):
        return torch.where(X > 0, X, a*X)

In [22]:
def model(X, w_h, w_h2, w_o, a, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = PRelu(X @ w_h, a)
    h = dropout(h, p_drop_hidden)
    h2 = PRelu(h @ w_h2, a)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [23]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))
a = torch.tensor([-0.1], requires_grad = True)

optimizer = RMSprop([w_h, w_h2, w_o, a])

In [28]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, a, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    #print("Loss: {:3f}".format(cost))
    print('loss: %.4f' % cost)
    print('a: %.4f' % a)
    optimizer.step()

Loss: 3.289235
loss 3.2892
tensor(1.00000e-02 *
       [-2.1158])
Loss: 3.820173
loss 3.8202
tensor(1.00000e-02 *
       [-2.1728])
Loss: 3.088272
loss 3.0883
tensor(1.00000e-02 *
       [-2.2261])
Loss: 3.668830
loss 3.6688
tensor(1.00000e-02 *
       [-2.2717])
Loss: 3.334632
loss 3.3346
tensor(1.00000e-02 *
       [-2.3077])
Loss: 3.254644
loss 3.2546
tensor(1.00000e-02 *
       [-2.3325])
Loss: 4.470075
loss 4.4701
tensor(1.00000e-02 *
       [-2.3446])
Loss: 3.857010
loss 3.8570
tensor(1.00000e-02 *
       [-2.3458])
Loss: 4.010983
loss 4.0110
tensor(1.00000e-02 *
       [-2.3295])
Loss: 3.515331
loss 3.5153
tensor(1.00000e-02 *
       [-2.2943])
Loss: 3.096645
loss 3.0966
tensor(1.00000e-02 *
       [-2.2424])
Loss: 3.232729
loss 3.2327
tensor(1.00000e-02 *
       [-2.1754])
Loss: 2.795955
loss 2.7960
tensor(1.00000e-02 *
       [-2.0916])
Loss: 2.906326
loss 2.9063
tensor(1.00000e-02 *
       [-1.9936])
Loss: 2.518763
loss 2.5188
tensor(1.00000e-02 *
       [-1.8840])
Loss: 2.89

Loss: 1.866323
loss 1.8663
tensor(1.00000e-03 *
       [-2.4176])
Loss: 1.733698
loss 1.7337
tensor(1.00000e-03 *
       [-3.0910])
Loss: 1.796145
loss 1.7961
tensor(1.00000e-03 *
       [-3.7794])
Loss: 1.828395
loss 1.8284
tensor(1.00000e-03 *
       [-4.4901])
Loss: 1.907035
loss 1.9070
tensor(1.00000e-03 *
       [-5.2274])
Loss: 1.776491
loss 1.7765
tensor(1.00000e-03 *
       [-5.9771])
Loss: 2.077062
loss 2.0771
tensor(1.00000e-03 *
       [-6.7529])
Loss: 1.768762
loss 1.7688
tensor(1.00000e-03 *
       [-7.5386])
Loss: 1.816923
loss 1.8169
tensor(1.00000e-03 *
       [-8.3358])
Loss: 1.821556
loss 1.8216
tensor(1.00000e-03 *
       [-9.1399])
Loss: 1.749877
loss 1.7499
tensor(1.00000e-03 *
       [-9.9551])
Loss: 1.911513
loss 1.9115
tensor(1.00000e-02 *
       [-1.0781])
Loss: 1.882624
loss 1.8826
tensor(1.00000e-02 *
       [-1.1592])
Loss: 2.141543
loss 2.1415
tensor(1.00000e-02 *
       [-1.2381])
Loss: 2.288980
loss 2.2890
tensor(1.00000e-02 *
       [-1.3136])
Loss: 2.55

Loss: 3.196856
loss 3.1969
tensor(1.00000e-02 *
       [-1.9387])
Loss: 3.516930
loss 3.5169
tensor(1.00000e-02 *
       [-2.0036])
Loss: 3.609951
loss 3.6100
tensor(1.00000e-02 *
       [-2.0572])
Loss: 3.316198
loss 3.3162
tensor(1.00000e-02 *
       [-2.0982])
Loss: 3.751873
loss 3.7519
tensor(1.00000e-02 *
       [-2.1259])
Loss: 3.656576
loss 3.6566
tensor(1.00000e-02 *
       [-2.1383])
Loss: 3.722814
loss 3.7228
tensor(1.00000e-02 *
       [-2.1351])
Loss: 3.297795
loss 3.2978
tensor(1.00000e-02 *
       [-2.1121])
Loss: 3.657249
loss 3.6572
tensor(1.00000e-02 *
       [-2.0700])
Loss: 3.600127
loss 3.6001
tensor(1.00000e-02 *
       [-2.0083])
Loss: 2.845869
loss 2.8459
tensor(1.00000e-02 *
       [-1.9224])
Loss: 2.882328
loss 2.8823
tensor(1.00000e-02 *
       [-1.8217])
Loss: 2.494658
loss 2.4947
tensor(1.00000e-02 *
       [-1.7084])
Loss: 2.620872
loss 2.6209
tensor(1.00000e-02 *
       [-1.5866])
Loss: 2.254149
loss 2.2541
tensor(1.00000e-02 *
       [-1.4587])
Loss: 2.17

Loss: 2.464392
loss 2.4644
tensor(1.00000e-02 *
       [-1.5971])
Loss: 2.106827
loss 2.1068
tensor(1.00000e-02 *
       [-1.4705])
Loss: 2.114560
loss 2.1146
tensor(1.00000e-02 *
       [-1.3448])
Loss: 2.460776
loss 2.4608
tensor(1.00000e-02 *
       [-1.2198])
Loss: 2.427544
loss 2.4275
tensor(1.00000e-02 *
       [-1.0972])
Loss: 1.931466
loss 1.9315
tensor(1.00000e-03 *
       [-9.7424])
Loss: 1.995169
loss 1.9952
tensor(1.00000e-03 *
       [-8.5351])
Loss: 2.150693
loss 2.1507
tensor(1.00000e-03 *
       [-7.3611])
Loss: 1.931295
loss 1.9313
tensor(1.00000e-03 *
       [-6.1977])
Loss: 2.613325
loss 2.6133
tensor(1.00000e-03 *
       [-5.0685])
Loss: 3.311715
loss 3.3117
tensor(1.00000e-03 *
       [-3.9835])
Loss: 1.785921
loss 1.7859
tensor(1.00000e-03 *
       [-2.9281])
Loss: 2.518485
loss 2.5185
tensor(1.00000e-03 *
       [-1.8716])
Loss: 2.126100
loss 2.1261
tensor(1.00000e-04 *
       [-8.4290])
Loss: 2.111485
loss 2.1115
tensor(1.00000e-04 *
       [ 1.2693])
Loss: 4.82

Loss: 2.086820
loss 2.0868
tensor(1.00000e-04 *
       [-4.4826])
Loss: 2.923357
loss 2.9234
tensor(1.00000e-04 *
       [ 5.5034])
Loss: 3.556650
loss 3.5567
tensor(1.00000e-04 *
       [ 2.2065])
Loss: 3.933072
loss 3.9331
tensor(1.00000e-03 *
       [-1.2043])
Loss: 1.759226
loss 1.7592
tensor(1.00000e-03 *
       [-2.5870])
Loss: 1.793403
loss 1.7934
tensor(1.00000e-03 *
       [-3.8999])
Loss: 1.784362
loss 1.7844
tensor(1.00000e-03 *
       [-5.1708])
Loss: 1.832796
loss 1.8328
tensor(1.00000e-03 *
       [-6.4045])
Loss: 1.910892
loss 1.9109
tensor(1.00000e-03 *
       [-7.6092])
Loss: 1.926653
loss 1.9267
tensor(1.00000e-03 *
       [-8.7878])
Loss: 1.949395
loss 1.9494
tensor(1.00000e-03 *
       [-9.9419])
Loss: 1.872688
loss 1.8727
tensor(1.00000e-02 *
       [-1.1066])
Loss: 2.788858
loss 2.7889
tensor(1.00000e-02 *
       [-1.2173])
Loss: 2.207045
loss 2.2070
tensor(1.00000e-02 *
       [-1.3300])
Loss: 2.099492
loss 2.0995
tensor(1.00000e-02 *
       [-1.4391])
Loss: 2.14

As one can see, the PRelu is adaptedin each step.