## Using Adagrad

In [5]:
from __future__ import print_function

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x)

In [18]:
# Parameters
epochs = 9
batch_sizes = [64, 32]
learning_rates = [0.005, 0.01, 0.05, 0.1]
lr_decays = [0]
weight_decay = [0]

In [19]:
def train(train_loader):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
    return test(train_loader)


def test(test_loader):
    # TODO: Test the model on the test-set and report the loss and accuracy.
    correct = 0
    total = 0
    total_loss = 0
    batch_count = 0
    for data in test_loader:
        images, labels = data
        #images, labels = Variable(images), Variable(labels)
        output = model(Variable(images))
        
        loss = F.nll_loss(output, Variable(labels))
        #print('Loss: {0}'.format(loss.data[0]))
        total_loss += loss.data[0]
        batch_count += 1
        _, prediction = torch.max(output.data, 1)
        batch_corr = (prediction == labels).sum()
        #print('Batch correct: {0}, Batch size: {1}'.format(batch_corr, labels.size(0)))
        correct += batch_corr
        total += labels.size(0)

    acc = 100*correct/total
    #print('Accuracy: {:.2f}'.format(acc))
    error = 100-acc
    avg_loss = total_loss/batch_count
    #print('Avg Test loss for epoch {0}: {1}'.format(epoch, avg_loss))
    return avg_loss, error

In [20]:
for batch_size in batch_sizes:
    for lr in learning_rates:
        for lr_decay in lr_decays:
            for decay in weight_decay:
                seed = 1
                log_interval = 10
                test_batch_size = 1000
                torch.manual_seed(seed)

                train_loader = torch.utils.data.DataLoader(
                            datasets.MNIST('../data', train=True, download=True,
                                           transform=transforms.Compose([
                                               transforms.ToTensor(),
                                               transforms.Normalize((0.1307,), (0.3081,))
                                           ])),
                            batch_size=batch_size, shuffle=True)
                test_loader = torch.utils.data.DataLoader(
                            datasets.MNIST('../data', train=False, transform=transforms.Compose([
                                               transforms.ToTensor(),
                                               transforms.Normalize((0.1307,), (0.3081,))
                                           ])),
                            batch_size=test_batch_size, shuffle=True)
                        
                # Initialize Model and optimizer
                model = Net()
                optimizer = optim.Adagrad(model.parameters(), lr = lr, lr_decay = lr_decay, 
                                              weight_decay = decay)
                        

                for epoch in range(1, epochs + 1):
                    train_loss, train_error = train(train_loader)
                    if train_loss < 0.13:
                        break
                    #test_loss.append(test())
                test_loss, test_error = test(test_loader)
                print('Final epoch: {0}, train loss: {7}, train error: {1}, test error: {2} for parameters: batch_size = {3}, lr = {4}, lr_decay={5}, weight_decay = {6}'.format(epoch,
                train_error, test_error, batch_size, lr, lr_decay, decay, train_loss))
                        



Final epoch: 9, train loss: 0.24138649415645771, train error: 6.983333333333334, test error: 6.680000000000007 for parameters: batch_size = 64, lr = 0.005, lr_decay=0, weight_decay = 0
Final epoch: 9, train loss: 0.1806037626795169, train error: 5.028333333333336, test error: 4.989999999999995 for parameters: batch_size = 64, lr = 0.01, lr_decay=0, weight_decay = 0
Final epoch: 9, train loss: 0.4293162641621856, train error: 13.946666666666673, test error: 13.25 for parameters: batch_size = 64, lr = 0.05, lr_decay=0, weight_decay = 0
Final epoch: 9, train loss: 0.21542659404673684, train error: 6.5049999999999955, test error: 6.329999999999998 for parameters: batch_size = 64, lr = 0.1, lr_decay=0, weight_decay = 0


KeyboardInterrupt: 

## Observations:

- It is found that Adagrad is easier to do parameter optimization as it has lesser parameters to deal with. 
- The best parameters are batch_size = 64, lr = 0.01, lr_decay=0, weight_decay = 0