In [1]:
from __future__ import print_function

import numpy as np
import matplotlib.pyplot as plt
import copy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.optim.lr_scheduler as LR

import torchvision
import torchvision.transforms as transforms

In [2]:
transform = transforms.ToTensor()


trainset = torchvision.datasets.MNIST(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.MNIST(root='./data', train=False,
                                       download=True, transform=transform)

batch_size = 200

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

print('==>>> total training batch number: {}'.format(len(trainloader)))
print('==>>> total testing batch number: {}'.format(len(testloader)))

# for i, data in enumerate(trainloader, 0):
#     print(len(data))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing...
Done!
==>>> total training batch number: 300
==>>> total testing batch number: 50


In [27]:
class NN(nn.Module):
    def __init__(self, layersize):
        super(NN, self).__init__()

    def forward(self, x):
        return x
    
    def update(self, u, v, eta=None):
        pass
    
class BN(nn.Module):
    def __init__(self, layersize, eta=None):
        super(BN, self).__init__()
        self.gain = nn.Parameter(torch.ones(layersize))
        self.bias = nn.Parameter(torch.zeros(layersize))
        
    def forward(self, x):
        
        beta = x.mean(0, keepdim=True)
        alpha = ((x-beta)**2).mean(0, keepdim=True).sqrt()

        # Normalize
        nx = (x-beta)/alpha

        # Adjust using learned parameters
        o = self.gain*nx + self.bias
        return o

    def update(self, u, v, eta=None):
        pass

class IP(nn.Module):
    def __init__(self, layersize, eta=1):
        super(IP, self).__init__()
        self.eta = eta
        
        # gain/bias are the learned output distribution params
        self.gain = nn.Parameter(torch.ones(layersize))
        self.bias = nn.Parameter(torch.zeros(layersize))
        
        # Alpha and beta are the ip normalization parameters
        self.register_buffer('alpha', torch.ones(layersize))
        self.register_buffer('beta', torch.zeros(layersize))
        
    def forward(self, x):

        # Normalize
        nx = (x-self.beta)/self.alpha

        # Adjust using learned parameters
        o = self.gain*nx + self.bias
        return  o
        
    def update(self, u, v, eta=None):
        
        if (eta is None):
            eta = self.eta
        
        Eu = u.mean(0, keepdim=True)
        Euu = (u**2).mean(0, keepdim=True)
        Ev = v.mean(0, keepdim=True)
        Evv = (v**2).mean(0, keepdim=True)
        
        
        self.alpha.data = (1-eta)*self.alpha + eta * (1/self.alpha - 2*(Euu - Eu**2))

        self.beta.data = (1-eta)*self.beta + eta * (-2 * Ev)


class Net(nn.Module):
    def __init__(self, layersize, norm=None, eta=1):
        super(Net, self).__init__()
        
        # Dense Layers
        self.fc1 = nn.Linear(28*28, layersize)
        self.fc2 = nn.Linear(layersize, layersize)
        self.fc3 = nn.Linear(layersize, 10)
        
        # Normalization Layers
        self.n1 = norm(layersize, eta)
        self.n2 = norm(layersize, eta)
        
    def forward(self, x, eta=None):
        x = x.view(-1, 28*28)
        u1 = self.fc1(x)
        v1 = F.tanh(self.n1(u1))
        u2 = self.fc2(v1)
        v2 = F.tanh(self.n2(u2))
        # Note you should not normalize after the last linear layer (you delete info)
        o = F.relu(self.fc9(v2))
        
        # Lets do the updates to the normalizations
        self.n1.update(u1, v1, eta)
        self.n2.update(u2, v2, eta)
        
        return o

class DNet(nn.Module):
    def __init__(self, layersize, norm=None, eta=1):
        super(DNet, self).__init__()
        
        # Dense Layers
        self.fc1 = nn.Linear(28*28, layersize)
        self.fc2 = nn.Linear(layersize, layersize)
        self.fc3 = nn.Linear(layersize, layersize)
        self.fc4 = nn.Linear(layersize, layersize)
        self.fc5 = nn.Linear(layersize, layersize)
        self.fc6 = nn.Linear(layersize, layersize)
        self.fc7 = nn.Linear(layersize, layersize)
        self.fc8 = nn.Linear(layersize, layersize)
        self.fc9 = nn.Linear(layersize, 10)
        
        # Normalization Layers
        self.n1 = norm(layersize, eta)
        self.n2 = norm(layersize, eta)
        self.n3 = norm(layersize, eta)
        self.n4 = norm(layersize, eta)
        self.n5 = norm(layersize, eta)
        self.n6 = norm(layersize, eta)
        self.n7 = norm(layersize, eta)
        self.n8 = norm(layersize, eta)
        
    def forward(self, x, eta=None):
        x = x.view(-1, 28*28)
        u1 = self.fc1(x)
        v1 = F.tanh(self.n1(u1))
        u2 = self.fc2(v1)
        v2 = F.tanh(self.n2(u2))
        u3 = self.fc3(v2)
        v3 = F.tanh(self.n3(u3))
        u4 = self.fc4(v3)
        v4 = F.tanh(self.n4(u4))
        u5 = self.fc5(v4)
        v5 = F.tanh(self.n5(u5))
        u6 = self.fc6(v5)
        v6 = F.tanh(self.n6(u6))
        u7 = self.fc7(v6)
        v7 = F.tanh(self.n7(u7))
        u8 = self.fc8(v7)
        v8 = F.tanh(self.n8(u8))
        # Note you should not normalize after the last linear layer (you delete info)
        o = F.relu(self.fc9(v8))
        
        # Lets do the updates to the normalizations
        self.n1.update(u1, v1, eta)
        self.n2.update(u2, v2, eta)
        self.n3.update(u3, v3, eta)
        self.n4.update(u4, v4, eta)
        self.n5.update(u5, v5, eta)
        self.n6.update(u6, v6, eta)
        self.n7.update(u7, v7, eta)
        self.n8.update(u8, v8, eta)
        
        
        return o

In [30]:

def train_deep_model(network, optimization, seed):
    
    criterion = nn.CrossEntropyLoss()
    torch.manual_seed(seed)
    
    for epoch in range(30):  # loop over the dataset multiple times

        running_loss = 0.0

        for i, data in enumerate(trainloader):
            
            # get the inputs
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimization.zero_grad()

            # forward + backward + optimize
            y = network(inputs)
            loss = criterion(y, labels)
            loss.backward()
            optimization.step()

            # print statistics
            running_loss += loss.item()
            if i % 100 == 99:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
            
    print("Finished training!\n")

In [31]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

seed = 986

INPUTSIZE = 28*28
NBLAYERS = 9
LAYERSIZE = 25

# #Train IP Model
# torch.manual_seed(seed)
# IPnet = Net()
# IPnet = IPnet.to(device)

# optimizer1 = optim.Adam(IPnet.parameters(), lr=0.001)
# print("Training IP Net")
# train_model(IPnet, optimizer1, seed, True)

# #Train Standard Model
# torch.manual_seed(seed)
# net = Net()
# net = net.to(device)

# optimizer2 = optim.Adam(net.parameters(), lr=0.001)
# print("Training Standard Net")
# train_model(net, optimizer2, seed, False)

#Train Deep IP Model
torch.manual_seed(seed)
DIPnet = DNet(LAYERSIZE, IP, eta=0.5)
DIPnet = DIPnet.to(device)

optimizer = optim.Adam(DIPnet.parameters(), lr=0.0001)
print("Training Deep IP Net")
train_deep_model(DIPnet, optimizer, seed)

#Train Deep Standard Model
torch.manual_seed(seed)
DBNnet = DNet(LAYERSIZE, BN, eta=0.5)
DBNnet = DBNnet.to(device)

optimizer = optim.Adam(DBNnet.parameters(), lr=0.0001)
print("Training Deep Standard Net")
train_deep_model(DBNnet, optimizer, seed)

# for param in IPnet.parameters():
#     print(param.data)
    
# for param in IPnet.parameters():
#     print(param.data)

# c_net = copy.deepcopy(net)

Training Deep IP Net
[1,   100] loss: 0.118
[1,   200] loss: 0.116
[1,   300] loss: 0.115
[2,   100] loss: 0.115
[2,   200] loss: 0.115
[2,   300] loss: 0.115
[3,   100] loss: 0.115
[3,   200] loss: 0.115
[3,   300] loss: 0.115
[4,   100] loss: 0.115
[4,   200] loss: 0.115
[4,   300] loss: 0.115
[5,   100] loss: 0.115
[5,   200] loss: 0.115


KeyboardInterrupt: 

In [10]:
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        images, labels = images.to(device), labels.to(device)
        u1, y1, u2, y2, u3, y3, u4, y4, u5, y5, u6, y6, u7, y7, u8, y8, u9, y9  = DIPnet(images)
        _, predicted = torch.max(y9.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the IP network on the 10000 test images: %d %%' % (
    100 * correct / total))

correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        images, labels = images.to(device), labels.to(device)
        u1, y1, u2, y2, u3, y3, u4, y4, u5, y5, u6, y6, u7, y7, u8, y8, u9, y9  = Dnet(images)
        _, predicted = torch.max(y9.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the standard network on the 10000 test images: %d %%' % (
    100 * correct / total))

# val1, ind1 = DIPnet.fc1.weight.max(0)
# max_weight = val1.max(0)
# print(max_weight)

# val1, ind1 = Dnet.fc1.weight.max(0)
# max_weight = val1.max(0)
# print(max_weight)

Accuracy of the IP network on the 10000 test images: 66 %
Accuracy of the standard network on the 10000 test images: 75 %
