# 1 Introduction

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math

import torch
import torch.optim
import torch.functional as F

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms

from torch.nn.functional import conv2d, max_pool2d

In [2]:
mb_size = 100 # mini-batch size of 100

In [3]:
trans = transforms.Compose([transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5),
                                                 (0.5, 0.5, 0.5))])

In [4]:
dataset = dset.MNIST("./", download = True,
                     train = True,
                     transform = trans)

In [5]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=mb_size,
                                         shuffle=True, num_workers=1,
                                         pin_memory=True)


In [6]:
def init_weights(shape):
    # xavier initialization (a good initialization is important!)
    # http://andyljones.tumblr.com/post/110998971763/an-explanation-of-xavier-initialization
    fan_in = shape[0]
    fan_out = shape[1]
    variance = 2.0/(fan_in + fan_out)
    w = torch.randn(size=shape)*np.sqrt(variance)
    w.requires_grad = True
    return w

In [7]:
def rectify(X):
    return torch.max(torch.zeros_like(X), X)

In [8]:
# you can also use torch.nn.functional.softmax on future sheets
def softmax(X):
    c = torch.max(X, dim=1)[0].reshape(mb_size, 1)
    # this avoids a blow up of the exponentials
    # but calculates the same formula
    stabelized = X-c
    exp = torch.exp(stabelized)
    return exp/torch.sum(exp, dim=1).reshape(mb_size, 1)

In [9]:
# this is an example as a reduced version of the pytorch internal RMSprop optimizer
class RMSprop(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.9, eps=1e-8):
        defaults = dict(lr=lr, alpha=alpha, eps=eps)
        super(RMSprop, self).__init__(params, defaults)

    def step(self):
        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['square_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                alpha = group['alpha']

                # update running averages
                square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)
                avg = square_avg.sqrt().add_(group['eps'])

                # gradient update
                p.data.addcdiv_(-group['lr'], grad, avg)

In [10]:
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    #X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    #h = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    #h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [11]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))

optimizer = RMSprop([w_h, w_h2, w_o])

In [12]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.543125867843628
Loss: 3.167893409729004
Loss: 2.11494517326355
Loss: 2.336005449295044
Loss: 2.2041916847229004
Loss: 2.149024724960327
Loss: 1.8707449436187744
Loss: 1.8855136632919312
Loss: 2.0242385864257812
Loss: 1.8084663152694702
Loss: 1.807221531867981
Loss: 1.6832900047302246
Loss: 1.727016806602478
Loss: 1.5345144271850586
Loss: 1.6410164833068848
Loss: 1.5259240865707397
Loss: 1.4974876642227173
Loss: 1.4708524942398071
Loss: 1.3217053413391113
Loss: 1.3835594654083252
Loss: 1.2267502546310425
Loss: 1.1142263412475586
Loss: 1.1801670789718628
Loss: 1.141950249671936
Loss: 1.1050403118133545
Loss: 1.2678269147872925
Loss: 1.0788801908493042
Loss: 1.094488501548767
Loss: 0.9235233068466187
Loss: 0.971934974193573
Loss: 0.7941033840179443
Loss: 0.7810167670249939
Loss: 0.805601179599762
Loss: 1.1381202936172485
Loss: 0.82025146484375
Loss: 0.9093353748321533
Loss: 0.8725736141204834
Loss: 0.760343611240387
Loss: 0.9493373036384583
Loss: 0.8345934152603149
Loss: 0.8497349

Loss: 0.43049556016921997
Loss: 0.4028354287147522
Loss: 0.7192733287811279
Loss: 0.42686694860458374
Loss: 0.2545320391654968
Loss: 0.45023685693740845
Loss: 0.2101462483406067
Loss: 0.9105786681175232
Loss: 0.5288601517677307
Loss: 0.7229608297348022
Loss: 0.4575176537036896
Loss: 0.43677496910095215
Loss: 0.30618512630462646
Loss: 0.43474656343460083
Loss: 0.39928972721099854
Loss: 0.5100942254066467
Loss: 0.8629547357559204
Loss: 0.5943202972412109
Loss: 0.38674426078796387
Loss: 0.4581432640552521
Loss: 0.5663266777992249
Loss: 0.8519665002822876
Loss: 0.33807235956192017
Loss: 0.7346905469894409
Loss: 0.3195611834526062
Loss: 0.40302401781082153
Loss: 0.4184427559375763
Loss: 0.32671600580215454
Loss: 0.19888153672218323
Loss: 0.6668321490287781
Loss: 0.48521482944488525
Loss: 0.3711341917514801
Loss: 0.3347781002521515
Loss: 0.5677977204322815
Loss: 0.43925610184669495
Loss: 0.5576645135879517
Loss: 0.5453698635101318
Loss: 0.21910996735095978
Loss: 0.3059163987636566
Loss: 0.35

# 2 Dropout

In [75]:
def dropout1(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.tensor(np.random.binomial(1, p_drop, X.size())).float()
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()

def dropout(X, p_drop=1.):
    if 0 < p_drop < 1:
        phi = torch.bernoulli(torch.full(X.shape, p_drop))
        X = phi*X/p_drop
        return X.float()
    else:
        return X.float()

In [76]:
def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(X @ w_h)
    h = dropout(h, p_drop_hidden)
    h2 = rectify(h @ w_h2)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [77]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))

optimizer = RMSprop([w_h, w_h2, w_o])

In [78]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    optimizer.step()

Loss: 2.9777066707611084
Loss: 2.792482376098633
Loss: 2.452669620513916
Loss: 2.656148910522461
Loss: 2.522226095199585
Loss: 2.391789197921753
Loss: 2.2834012508392334
Loss: 2.2822084426879883
Loss: 2.2742981910705566
Loss: 2.2216081619262695
Loss: 2.1191587448120117
Loss: 2.122025966644287
Loss: 2.1486589908599854
Loss: 2.210784912109375
Loss: 2.3145430088043213
Loss: 2.047504186630249
Loss: 2.0910446643829346
Loss: 2.0555498600006104
Loss: 2.127573013305664
Loss: 2.1899008750915527
Loss: 2.011915922164917
Loss: 2.0761187076568604
Loss: 2.0185554027557373
Loss: 1.9213013648986816
Loss: 1.9938369989395142
Loss: 1.8620680570602417
Loss: 1.8784730434417725
Loss: 1.9026424884796143
Loss: 1.8876065015792847
Loss: 1.908701777458191
Loss: 1.9440159797668457
Loss: 1.9917231798171997
Loss: 1.813230037689209
Loss: 1.7259515523910522
Loss: 1.8802047967910767
Loss: 1.721096396446228
Loss: 1.8942500352859497
Loss: 1.8478379249572754
Loss: 1.7724220752716064
Loss: 1.7721017599105835
Loss: 1.76388

Loss: 1.2464197874069214
Loss: 1.5268865823745728
Loss: 1.0447982549667358
Loss: 1.1179317235946655
Loss: 1.5214200019836426
Loss: 1.4846481084823608
Loss: 1.2078003883361816
Loss: 1.3247840404510498
Loss: 1.2816874980926514
Loss: 1.2159488201141357
Loss: 1.3111481666564941
Loss: 1.355787992477417
Loss: 1.2426345348358154
Loss: 1.4561281204223633
Loss: 1.4086788892745972
Loss: 1.1907093524932861
Loss: 1.4026079177856445
Loss: 1.1233417987823486
Loss: 1.271918773651123
Loss: 1.2502374649047852
Loss: 1.3481303453445435
Loss: 1.5370103120803833
Loss: 1.1817386150360107
Loss: 1.5656694173812866
Loss: 1.1446104049682617
Loss: 1.1795508861541748
Loss: 1.3387999534606934
Loss: 1.0111130475997925
Loss: 1.149440884590149
Loss: 1.4036558866500854
Loss: 1.3407385349273682
Loss: 1.4758350849151611
Loss: 1.2943012714385986
Loss: 1.3095871210098267
Loss: 1.2404696941375732
Loss: 1.2576706409454346
Loss: 1.2606594562530518
Loss: 1.2611839771270752
Loss: 1.2416951656341553
Loss: 1.3786442279815674
Los

# 3 Parametric Relu

In [79]:
def PRelu(X,a):
        X[X < 0] *= a
        return X

In [80]:
def model(X, w_h, w_h2, w_o, a, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = PRelu(X @ w_h, a)
    h = dropout(h, p_drop_hidden)
    h2 = PRelu(h @ w_h2, a)
    h2 = dropout(h2, p_drop_hidden)
    pre_softmax = h2 @ w_o
    return pre_softmax

In [81]:
w_h = init_weights((784, 50))
w_h2 = init_weights((50, 50))
w_o = init_weights((50, 10))
a = torch.tensor([-0.1], requires_grad = True)

optimizer = RMSprop([w_h, w_h2, w_o, a])

In [82]:
# put this into a training loop over 100 epochs
for (_, (X, y)) in enumerate(dataloader, 0):
    noise_py_x = model(X.reshape(mb_size, 784), w_h, w_h2, w_o, a, 0.8, 0.7)
    cost = torch.nn.functional.cross_entropy(noise_py_x, y)
    cost.backward()
    print("Loss: {}".format(cost))
    print(a)
    optimizer.step()

Loss: 3.5792946815490723
tensor([-0.1000])
Loss: 2.3308827877044678
tensor(1.00000e-02 *
       [-9.6838])
Loss: 2.481755256652832
tensor(1.00000e-02 *
       [-9.4184])
Loss: 2.5653727054595947
tensor(1.00000e-02 *
       [-9.2078])
Loss: 2.589242935180664
tensor(1.00000e-02 *
       [-8.9768])
Loss: 2.3537137508392334
tensor(1.00000e-02 *
       [-8.7175])
Loss: 2.3711869716644287
tensor(1.00000e-02 *
       [-8.5124])
Loss: 2.3142757415771484
tensor(1.00000e-02 *
       [-8.3062])
Loss: 2.1950597763061523
tensor(1.00000e-02 *
       [-8.0801])
Loss: 2.3509299755096436
tensor(1.00000e-02 *
       [-7.8741])
Loss: 2.371683359146118
tensor(1.00000e-02 *
       [-7.6462])
Loss: 2.213571548461914
tensor(1.00000e-02 *
       [-7.4286])
Loss: 2.317676067352295
tensor(1.00000e-02 *
       [-7.2300])
Loss: 2.364468574523926
tensor(1.00000e-02 *
       [-7.0300])
Loss: 2.41117000579834
tensor(1.00000e-02 *
       [-6.8315])
Loss: 2.282320499420166
tensor(1.00000e-02 *
       [-6.6379])
Loss: 

Loss: 1.5037728548049927
tensor(1.00000e-03 *
       [-1.5916])
Loss: 1.6913496255874634
tensor(1.00000e-05 *
       [-2.3056])
Loss: 1.507453441619873
tensor(1.00000e-04 *
       [ 5.6297])
Loss: 1.3879448175430298
tensor(1.00000e-04 *
       [-3.1309])
Loss: 1.25860595703125
tensor(1.00000e-03 *
       [-1.7784])
Loss: 1.2854734659194946
tensor(1.00000e-03 *
       [-3.3396])
Loss: 1.5204304456710815
tensor(1.00000e-03 *
       [-4.9642])
Loss: 1.6559078693389893
tensor(1.00000e-03 *
       [-5.7613])
Loss: 1.4377409219741821
tensor(1.00000e-03 *
       [-5.2473])
Loss: 1.5431606769561768
tensor(1.00000e-03 *
       [-3.6358])
Loss: 1.4942843914031982
tensor(1.00000e-03 *
       [-1.5953])
Loss: 1.2114412784576416
tensor(1.00000e-05 *
       [ 4.0083])
Loss: 1.555799126625061
tensor(1.00000e-03 *
       [ 1.2971])
Loss: 1.639043927192688
tensor(1.00000e-03 *
       [ 2.0940])
Loss: 1.461340069770813
tensor(1.00000e-03 *
       [ 1.8517])
Loss: 1.4727301597595215
tensor(1.00000e-04 *


Loss: 1.418965458869934
tensor(1.00000e-03 *
       [-7.9503])
Loss: 1.2321873903274536
tensor(1.00000e-03 *
       [-6.8444])
Loss: 1.2828412055969238
tensor(1.00000e-03 *
       [-5.4544])
Loss: 1.1184210777282715
tensor(1.00000e-03 *
       [-3.9389])
Loss: 1.3173259496688843
tensor(1.00000e-03 *
       [-2.4148])
Loss: 1.347092866897583
tensor(1.00000e-03 *
       [-1.1435])
Loss: 1.117366075515747
tensor(1.00000e-04 *
       [-7.0195])
Loss: 1.2592750787734985
tensor(1.00000e-04 *
       [-8.7871])
Loss: 1.405979037284851
tensor(1.00000e-03 *
       [-1.6560])
Loss: 1.3261746168136597
tensor(1.00000e-03 *
       [-3.0883])
Loss: 1.2883803844451904
tensor(1.00000e-03 *
       [-4.7143])
Loss: 1.1273224353790283
tensor(1.00000e-03 *
       [-6.0334])
Loss: 1.5498366355895996
tensor(1.00000e-03 *
       [-7.2634])
Loss: 1.2449347972869873
tensor(1.00000e-03 *
       [-8.4743])
Loss: 1.252565860748291
tensor(1.00000e-03 *
       [-9.5786])
Loss: 1.298201084136963
tensor(1.00000e-02 *


Loss: 1.470557689666748
tensor(1.00000e-03 *
       [-4.4634])
Loss: 1.3628140687942505
tensor(1.00000e-03 *
       [-3.0593])
Loss: 1.3920527696609497
tensor(1.00000e-03 *
       [-1.7371])
Loss: 1.0459873676300049
tensor(1.00000e-03 *
       [-1.1517])
Loss: 1.4572416543960571
tensor(1.00000e-04 *
       [-6.6046])
Loss: 1.6917701959609985
tensor(1.00000e-03 *
       [-1.8978])
Loss: 1.4773287773132324
tensor(1.00000e-03 *
       [-4.0372])
Loss: 1.4281824827194214
tensor(1.00000e-03 *
       [-5.9905])
Loss: 1.295750379562378
tensor(1.00000e-03 *
       [-7.6444])
Loss: 1.4482492208480835
tensor(1.00000e-03 *
       [-9.0304])
Loss: 1.3754119873046875
tensor(1.00000e-02 *
       [-1.0168])
Loss: 1.339665412902832
tensor(1.00000e-02 *
       [-1.0992])
Loss: 1.4718228578567505
tensor(1.00000e-02 *
       [-1.1527])
Loss: 1.6499202251434326
tensor(1.00000e-02 *
       [-1.1658])
Loss: 1.276034951210022
tensor(1.00000e-02 *
       [-1.1335])
Loss: 1.3479527235031128
tensor(1.00000e-02 

Loss: 1.7447084188461304
tensor(1.00000e-03 *
       [-6.5438])
Loss: 1.5175604820251465
tensor(1.00000e-03 *
       [-8.2601])
Loss: 1.4598640203475952
tensor(1.00000e-03 *
       [-9.7870])
Loss: 1.4292387962341309
tensor(1.00000e-02 *
       [-1.1187])
Loss: 1.4570183753967285
tensor(1.00000e-02 *
       [-1.2467])
Loss: 1.7081387042999268
tensor(1.00000e-02 *
       [-1.3613])
Loss: 1.7819392681121826
tensor(1.00000e-02 *
       [-1.4598])
Loss: 1.9006860256195068
tensor(1.00000e-02 *
       [-1.5447])
Loss: 1.9601951837539673
tensor(1.00000e-02 *
       [-1.6134])
Loss: 1.9594799280166626
tensor(1.00000e-02 *
       [-1.6558])
Loss: 1.8779799938201904
tensor(1.00000e-02 *
       [-1.6744])
Loss: 2.5713717937469482
tensor(1.00000e-02 *
       [-1.6736])
Loss: 2.4487171173095703
tensor(1.00000e-02 *
       [-1.6450])
Loss: 2.3567659854888916
tensor(1.00000e-02 *
       [-1.5899])
Loss: 2.122060537338257
tensor(1.00000e-02 *
       [-1.5041])
Loss: 1.8207502365112305
tensor(1.00000e-