In [1]:
import torch
import torch.nn as nn
import torchvision
import torch.optim as optim

In [2]:
batch_size = 100
epochs = 25
num_classes=10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
data_transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

train_set = torchvision.datasets.MNIST('.data/', train=True, download=True, transform=data_transforms)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)

test_set = torchvision.datasets.MNIST('.data/', train=False, download=True, transform=data_transforms)
test_loader = torch.utils.data.DataLoader(test_set,batch_size=batch_size, shuffle=True)

In [4]:
x_batch, y_batch = iter(train_loader).next()
print("Training set: {} samples - Max value: {} - Min value: {}".format(len(train_loader.dataset), 
                                                                        x_batch.max(), x_batch.min()))

Training set: 60000 samples - Max value: 1.0 - Min value: 0.0


In [5]:
x_batch, y_batch = iter(test_loader).next()
print("Test set: {} samples - Max value: {} - Min value: {}".format(len(test_loader.dataset), 
                                                                        x_batch.max(), x_batch.min()))

Test set: 10000 samples - Max value: 1.0 - Min value: 0.0


In [6]:
print("Example batch shape: {}".format(x_batch.shape))

Example batch shape: torch.Size([100, 1, 28, 28])


In [7]:
class GaussianNoise(nn.Module):
    """Gaussian noise regularizer.

    Args:
        sigma (float, optional): relative standard deviation used to generate the
            noise. Relative means that it will be multiplied by the magnitude of
            the value your are adding the noise to. This means that sigma can be
            the same regardless of the scale of the vector.
        is_relative_detach (bool, optional): whether to detach the variable before
            computing the scale of the noise. If `False` then the scale of the noise
            won't be seen as a constant but something to optimize: this will bias the
            network to generate vectors with smaller values.
    """

    def __init__(self, sigma=0.1, is_relative_detach=True):
        super().__init__()
        self.sigma = sigma
        self.is_relative_detach = is_relative_detach
        self.noise = torch.tensor(0).to(device).float()

    def forward(self, x):
        if self.training and self.sigma != 0:
            scale = self.sigma * x.detach() if self.is_relative_detach else self.sigma * x
            sampled_noise = self.noise.repeat(*x.size()).normal_() * scale
            x = x + sampled_noise
        return x 

In [8]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.gn0 = GaussianNoise(0.3)
        self.linear1 = nn.Linear(784, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.gn1 = GaussianNoise(0.3)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(1024, 1024)
        self.bn2 = nn.BatchNorm1d(1024)
        self.gn2 = GaussianNoise(0.3)
        self.relu2 = nn.ReLU()
        self.linear3 = nn.Linear(1024, 1024)
        self.bn3 = nn.BatchNorm1d(1024)
        self.gn3 = GaussianNoise(0.3)
        self.relu3 = nn.ReLU()
        self.classifier = nn.Linear(1024, num_classes)

    def forward(self, x):
        out = self.gn0(x)
        out = self.relu1(self.gn1(self.bn1(self.linear1(out))))
        out = self.relu2(self.bn2(self.linear2(out)))
        out = self.relu3(self.bn3(self.linear3(out)))
        out = self.classifier(out)       
        return out

net = Net().to(device)
print(net)

Net(
  (gn0): GaussianNoise()
  (linear1): Linear(in_features=784, out_features=1024, bias=True)
  (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gn1): GaussianNoise()
  (relu1): ReLU()
  (linear2): Linear(in_features=1024, out_features=1024, bias=True)
  (bn2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gn2): GaussianNoise()
  (relu2): ReLU()
  (linear3): Linear(in_features=1024, out_features=1024, bias=True)
  (bn3): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (gn3): GaussianNoise()
  (relu3): ReLU()
  (classifier): Linear(in_features=1024, out_features=10, bias=True)
)


In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.1, weight_decay=1e-6, momentum=0.9)

In [10]:
print("\n---- Start Training ----")
best_accuracy = -1
for epoch in range(epochs):

    # TRAIN THE NETWORK
    train_loss, train_correct = 0, 0
    net.train()
    for inputs, targets in train_loader:
        # data is a list of [inputs, labels]
        inputs, targets = inputs.to(device), targets.to(device)
        ## care! net expect a 784 size vector and our dataset provide 1x28x28 = Reshape!
        inputs = inputs.view(inputs.size(0), -1)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        _, pred = outputs.max(1)  # get the index of the max log-probability
        train_correct += pred.eq(targets).sum().item()

        # print statistics
        train_loss += loss.item()
        
    train_loss /= len(train_loader.dataset)

    # TEST NETWORK
    net.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            ## care! net expect a 784 size vector and our dataset provide 1x28x28 = Reshape!
            inputs = inputs.view(inputs.size(0), -1)
            outputs = net(inputs)
            test_loss += criterion(outputs, targets)
            _, pred = outputs.max(1)  # get the index of the max log-probability
            correct += pred.eq(targets).sum().item()

    test_loss /= len(test_loader.dataset)
    test_accuracy = 100. * correct / len(test_loader.dataset)
    print("[Epoch {}] Train Loss: {:.6f} - Test Loss: {:.6f} - Train Accuracy: {:.2f}% - Test Accuracy: {:.2f}%".format(epoch+1, train_loss, test_loss, 100. * train_correct / len(train_loader.dataset), test_accuracy))
    
    if test_accuracy>best_accuracy:
        best_accuracy = test_accuracy
    
print("Finished Training")
print("Best Test accuracy: {:.2f}".format(best_accuracy))


---- Start Training ----
[Epoch 1] Train Loss: 0.002258 - Test Loss: 0.000963 - Train Accuracy: 93.26% - Test Accuracy: 97.18%
[Epoch 2] Train Loss: 0.001033 - Test Loss: 0.000739 - Train Accuracy: 96.84% - Test Accuracy: 97.70%
[Epoch 3] Train Loss: 0.000745 - Test Loss: 0.000654 - Train Accuracy: 97.69% - Test Accuracy: 98.11%
[Epoch 4] Train Loss: 0.000581 - Test Loss: 0.000557 - Train Accuracy: 98.13% - Test Accuracy: 98.28%
[Epoch 5] Train Loss: 0.000476 - Test Loss: 0.000588 - Train Accuracy: 98.48% - Test Accuracy: 98.20%
[Epoch 6] Train Loss: 0.000416 - Test Loss: 0.000653 - Train Accuracy: 98.62% - Test Accuracy: 98.22%
[Epoch 7] Train Loss: 0.000353 - Test Loss: 0.000485 - Train Accuracy: 98.83% - Test Accuracy: 98.57%
[Epoch 8] Train Loss: 0.000284 - Test Loss: 0.000554 - Train Accuracy: 99.05% - Test Accuracy: 98.44%
[Epoch 9] Train Loss: 0.000272 - Test Loss: 0.000490 - Train Accuracy: 99.12% - Test Accuracy: 98.52%
[Epoch 10] Train Loss: 0.000250 - Test Loss: 0.000516 - 