In [1]:
# !python

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import matplotlib.pyplot as plt
#from torchsummary import summary
import time
import numpy as np
import cupy as cp
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")


In [2]:
torch.cuda.empty_cache()

In [3]:
# download and transform train dataset
train_loader = torch.utils.data.DataLoader(datasets.MNIST('../mnist_data', 
                                                          download=True, 
                                                          train=True,
                                                          transform=transforms.Compose([
                                                              transforms.ToTensor(), # first, convert image to PyTorch tensor
                                                              transforms.Normalize((0.1307,), (0.3081,)) # normalize inputs
                                                          ])), 
                                           batch_size=10, 
                                           shuffle=True)

# download and transform test dataset
test_loader = torch.utils.data.DataLoader(datasets.MNIST('../mnist_data', 
                                                          download=True, 
                                                          train=False,
                                                          transform=transforms.Compose([
                                                              transforms.ToTensor(), # first, convert image to PyTorch tensor
                                                              transforms.Normalize((0.1307,), (0.3081,)) # normalize inputs
                                                          ])), 
                                           batch_size=10, 
                                           shuffle=True)

In [4]:
class CNNClassifier(nn.Module):
    """Custom module for a simple convnet classifier"""
    def __init__(self):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 5, kernel_size=5)
        self.conv2 = nn.Conv2d(5, 10, kernel_size=5)
        self.fc1 = nn.Linear(10, 10)
    
    def forward(self, x):
        # input is 28x28x1
        # conv1(kernel=5, filters=10) 28x28x10 -> 24x24x10
        # max_pool(kernel=2) 24x24x10 -> 12x12x10
        
        # Do not be afraid of F's - those are just functional wrappers for modules form nn package
        # Please, see for yourself - http://pytorch.org/docs/_modules/torch/nn/functional.html
        x = F.leaky_relu(F.max_pool2d(self.conv1(x), 2))
        
        # conv2(kernel=5, filters=20) 12x12x20 -> 8x8x20
        # max_pool(kernel=2) 8x8x20 -> 4x4x20
        x = F.leaky_relu(F.max_pool2d(self.conv2(x), 8))
        
        # flatten 4x4x20 = 320
        x = x.view(-1, 10)
        
        # 320 -> 50
        x = self.fc1(x)

        # transform to logits
        return F.log_softmax(x, dim=1)

In [5]:
# create classifier and optimizer objects
clf = CNNClassifier().to(device)
opt = optim.SGD(clf.parameters(), lr=0.01, momentum=0.5)
criterion = nn.NLLLoss().to(device)
loss_history = []
acc_history = []

def train(epoch):
    clf.train() # set model in training mode (need this because of dropout)
    
    # dataset API gives us pythonic batching 
    for batch_id, (data, label) in enumerate(train_loader):
        data = Variable(data).to(device)
        target = Variable(label).to(device)
        
        # forward pass, calculate loss and backprop!
        opt.zero_grad()
        preds = clf(data)
        loss = criterion(preds, target)
        loss.backward()
        loss_history.append(loss.item())
        opt.step()
        
        if batch_id % 1000 == 0:
            print(loss.item())

def test(epoch):
    with torch.no_grad():
        clf.eval()
        test_loss = 0
        correct = 0

        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)

            output = clf(data)
            test_loss += F.nll_loss(output, target).item()
            pred = output.data.max(1)[1] # get the index of the max log-probability
            correct += pred.eq(target.data).cpu().sum()

    test_loss = test_loss
    test_loss /= len(test_loader) # loss function already averages over batch size
    accuracy = 100. * correct / len(test_loader.dataset)
    acc_history.append(accuracy)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy))

In [5]:
if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        clf = nn.DataParallel(clf)

clf.to(device)

Let's use 2 GPUs!


DataParallel(
  (module): CNNClassifier(
    (conv1): Conv2d(1, 5, kernel_size=(5, 5), stride=(1, 1))
    (conv2): Conv2d(5, 10, kernel_size=(5, 5), stride=(1, 1))
    (fc1): Linear(in_features=10, out_features=10, bias=True)
  )
)

In [6]:
for epoch in range(0,3):
    print("Epoch %d" % epoch)
    train(epoch)
    test(epoch)

Epoch 0
2.2886264324188232
0.13259916007518768
0.06747128814458847
0.3153684139251709
0.10231061279773712
0.012948465533554554

Test set: Average loss: 0.1877, Accuracy: 9477/10000 (94%)

Epoch 1
0.03583803027868271
0.09544344246387482
0.0319838747382164
0.42813271284103394
0.04897008091211319
0.07742156833410263

Test set: Average loss: 0.1543, Accuracy: 9544/10000 (95%)

Epoch 2
0.03315744549036026
0.028505325317382812
0.009849119000136852
0.08729724586009979
0.07444243133068085
0.008196592330932617

Test set: Average loss: 0.1349, Accuracy: 9590/10000 (95%)



In [67]:
def eval_hessian(loss_grad, model):
    cnt = 0
    for g in loss_grad:
        g_vector = g.contiguous().view(-1) if cnt == 0 else torch.cat([g_vector, g.contiguous().view(-1)])
        cnt = 1
    l = g_vector.size(0)
    g_vector = g_vector.to(device)
    hessian = torch.zeros(l, l)
    for idx in range(l):
        grad2rd = torch.autograd.grad(g_vector[idx], model.parameters(), create_graph=True)
        cnt = 0
        for g in grad2rd:
            g2 = g.contiguous().view(-1) if cnt == 0 else torch.cat([g2, g.contiguous().view(-1)])
            cnt = 1
        hessian[idx] = g2
    return hessian

In [73]:
for batch_id, (data, label) in enumerate(train_loader):
    data = Variable(data).to(device)
    target = Variable(label).to(device)
    break

## Function validation

In [None]:
#simple qudratic form

In [6]:
A = torch.tensor([
    [2.,1.],
    [1.,2.]
], requires_grad=True)

x = torch.tensor([2., 3.], requires_grad=True)

f = .5*x@A@x

loss_grad, = torch.autograd.grad(f, x, create_graph=True)

cnt = 0
for g in loss_grad:
    g_vector = g.contiguous().view(-1) if cnt == 0 else torch.cat([g_vector, g.contiguous().view(-1)])
    cnt = 1
l = g_vector.size(0)
g_vector = g_vector.to(device)
hessian = torch.zeros(l, l)
for idx in range(l):
    grad2rd = torch.autograd.grad(g_vector[idx], x, create_graph=True)
    cnt = 0
    for g in grad2rd:
        g2 = g.contiguous().view(-1) if cnt == 0 else torch.cat([g2, g.contiguous().view(-1)])
        cnt = 1
    hessian[idx] = g2
    
print(A)
print(hessian)

tensor([[2., 1.],
        [1., 2.]], requires_grad=True)
tensor([[2., 1.],
        [1., 2.]], grad_fn=<CopySlices>)


In [7]:
#liner regression

In [8]:
N = 100
nb = 10
y = np.random.rand(N)
X = np.random.rand(N, nb)*2-1
w = np.random.rand(nb)
#b = np.random.rand(100)

In [9]:
X = torch.tensor(X, requires_grad=True)
w = torch.tensor(w, requires_grad=True)
#b = torch.tensor(b, requires_grad=True)
y = torch.tensor(y, requires_grad=True)

In [10]:
loss = torch.norm(X@w-y)**2

In [11]:
loss_grad, = torch.autograd.grad(loss, w, create_graph=True)

In [12]:
cnt = 0
for g in loss_grad:
    g_vector = g.contiguous().view(-1) if cnt == 0 else torch.cat([g_vector, g.contiguous().view(-1)])
    cnt = 1
l = g_vector.size(0)
g_vector = g_vector.to(device)
hessian = torch.zeros(l, l)
for idx in range(l):
    grad2rd = torch.autograd.grad(g_vector[idx], w, create_graph=True)
    cnt = 0
    for g in grad2rd:
        g2 = g.contiguous().view(-1) if cnt == 0 else torch.cat([g2, g.contiguous().view(-1)])
        cnt = 1
    hessian[idx] = g2

In [13]:
np.allclose(2*X.detach().numpy().T@X.detach().numpy(), hessian.detach().numpy())

True

In [14]:
#binary logistic regression
#https://www.cs.ox.ac.uk/people/nando.defreitas/machinelearning/lecture6.pdf

In [15]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

X = np.random.rand(10,2)*2-1
w = np.array([2., 3.])
y = (sigmoid(X@w+np.random.rand())>.5).astype('float')

In [16]:
X = torch.tensor(X, requires_grad=True)
w = torch.tensor(w, requires_grad=True)
#b = torch.tensor(b, requires_grad=True)
y = torch.tensor(y, requires_grad=True)

In [17]:
#binary cross entropy
loss = -torch.sum(
    torch.log(F.sigmoid(X@w))*y + 
    torch.log(1-F.sigmoid(X@w))*(1-y)
)



In [18]:
pi = F.sigmoid(X@w)
torch.transpose(X, 1, 0) @(pi-y)

tensor([ 0.8753, -1.0963], dtype=torch.float64, grad_fn=<MvBackward>)

In [19]:
loss_grad, = torch.autograd.grad(loss, w, create_graph=True)

In [20]:
loss_grad

tensor([ 0.8753, -1.0963], dtype=torch.float64, grad_fn=<ThAddBackward>)

In [21]:
cnt = 0
for g in loss_grad:
    g_vector = g.contiguous().view(-1) if cnt == 0 else torch.cat([g_vector, g.contiguous().view(-1)])
    cnt = 1
l = g_vector.size(0)
g_vector = g_vector.to(device)
hessian = torch.zeros(l, l)
for idx in range(l):
    grad2rd = torch.autograd.grad(g_vector[idx], w, create_graph=True)
    cnt = 0
    for g in grad2rd:
        g2 = g.contiguous().view(-1) if cnt == 0 else torch.cat([g2, g.contiguous().view(-1)])
        cnt = 1
    hessian[idx] = g2

In [22]:
pi = F.sigmoid(X@w)
torch.transpose(X, 1, 0) @ torch.diag(pi*(1-pi)) @ X

tensor([[ 0.6700, -0.3953],
        [-0.3953,  0.4963]], dtype=torch.float64, grad_fn=<MmBackward>)

In [23]:
hessian

tensor([[ 0.6700, -0.3953],
        [-0.3953,  0.4963]], grad_fn=<CopySlices>)

In [242]:
class LogisticRegression(torch.nn.Module):
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(2, 2, bias=False)
    def forward(self, x):
        y_pred = F.sigmoid(self.linear(x))
        return y_pred


In [243]:
X = np.random.rand(10,2)*2-1
w = np.array([2., 3.])
y = (sigmoid(X@w+np.random.rand()/10)>.5).astype('float')

X = Variable(torch.tensor(X).float())
#w = torch.tensor([w], requires_grad=True).type(torch.float32)
#b = torch.tensor(b, requires_grad=True)
y = Variable(torch.tensor(y).long())

model = LogisticRegression()
#model.linear.weight = nn.Parameter(w)