1. Compute the gradient and Hessians of the loss function for linear regression with L2 regularization.

We take the derivative of the loss function with respect to $w'$ and get:
$$ \frac{\partial f(w')}{\partial w'} = \frac{1}{n} \sum_{i=1}^{n} -y_i x_i \frac{\exp(-y_i w^T x_i)}{1 + \exp(-y_i w^T x_i)} + 2\lambda w$$

Hence, the gradient is equal to:
$$ \nabla_{w'} f(w') = \frac{1}{n} \sum_{i=1}^{n} -y_i x_i \frac{\exp(-y_i w^T x_i)}{1 + \exp(-y_i w^T x_i)} + 2\lambda w$$

We then take the derivative of the gradient with respect to $w'$ and get:
$$ \frac{\partial^2 f(w')}{\partial w'^2} = \frac{\partial \nabla_{w'} f(w')}{\partial w'} = \frac{1}{n} \sum_{i=1}^{n} x_i x_i^T \frac{\exp(-y_i w^T x_i)}{(1 + \exp(-y_i w^T x_i))^2} + 2\lambda I$$

Hence, the Hessians is equal to:
$$\nabla^2_{w'} f(w') = \frac{1}{n} \sum_{i=1}^{n} x_i x_i^T \frac{\exp(-y_i w^T x_i)}{(1 + \exp(-y_i w^T x_i))^2} + 2\lambda I$$

In [17]:
#2. Empirically verify the correctness
import torch
import sklearn
from sklearn import datasets

#import breast cancer dataset
brest_cancer = datasets.load_breast_cancer()
X = brest_cancer.data
y = brest_cancer.target
y = y.reshape(-1,1)
#convert numpy array to torch tensor
X = torch.from_numpy(X).float()
y = torch.from_numpy(y).float()

#init tensor w as a vector
omega = torch.ones(X.shape[1],1,requires_grad=True) #w = [1...1]
bias = torch.ones(1,1,requires_grad=True)       #w0 = 1

#evaluate the gradients of f in part 1
def f(X, y, omega, bias):
    ll = 1
    #compute the loss
    loss = torch.sum(torch.log(1 + torch.exp(-y * (torch.matmul(X, omega) + bias))))*1/X.shape[0] + ll*
    torch.sum(omega**2)^2
    return loss
f(X, y, omega, bias).backward()

myOmegaGrad = torch.autograd.grad(f(X, y, omega, bias), omega, create_graph=True)[0]
myBiasGrad = torch.autograd.grad(f(X, y, omega, bias), bias, create_graph=True)[0]
print("myOmegaGrad: ", myOmegaGrad)
print("myBiasGrad: ", myBiasGrad)


myOmegaGrad:  tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], grad_fn=<MmBackward0>)
myBiasGrad:  tensor([[0.]], grad_fn=<SumBackward1>)


In [3]:


#import all the class and download/transform the data set
import torch
import sklearn

from sklearn.datasets import load_breast_cancer

from torch.utils.data import Dataset
from torch.utils.data import DataLoader, Subset
# TorchVision functions for dealing with vision data sets
from torchvision import datasets
import torchvision.transforms as T

# Matplotlib for visualization
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid

# use a GPU if it is present
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

## Download and load our train and test data sets, transforming them as we load for convenience

training_data = datasets.FashionMNIST(
	root = "data",
	train = True,
	download = True,
    transform = T.Compose([T.ToTensor(), T.Lambda(torch.flatten)]) # converts the images to tensors [1, 28, 28], then flatten them to vectors [1, 784]
)

testing_data = datasets.FashionMNIST(
	root = "data",
	train = False,
	download = True,
    transform = T.Compose([T.ToTensor(), T.Lambda(torch.flatten)])
)

# Extract class 0 and class 1 since we are doing binary classification 

train_idx = torch.where(training_data.targets < 2)[0]
train_subset = Subset(training_data, train_idx) # a PyTorch convenience function for extracting specific data points

test_idx = torch.where(testing_data.targets < 2)[0]
test_subset = Subset(testing_data, test_idx)

## Convert these PyTorch data sets into X, y matrix vector pairs

# DataLoaders are very convenient Python iterators for when we need to deal with shuffled 
# minibatches over multiple epochs, and our data can't all fit in memory at once. 
# Here we will not really use their features. We just use them to load all our training and 
# test data into the matrices
train_dataloader = DataLoader(train_subset, batch_size=len(train_subset), shuffle=False)
test_dataloader = DataLoader(test_subset, batch_size=len(test_subset), shuffle=False)

Xtrain, ytrain= next(iter(train_dataloader))
Xtrain = Xtrain.to(device) # send the data to the device we're using
ytrain = ytrain.apply_(lambda c: 2*c - 1).view((len(train_subset), 1)).to(device) # convert {0,1} labels to {1,-1} labels

Xtest, ytest= next(iter(test_dataloader))
Xtest = Xtest.to(device)
ytest = ytest.apply_(lambda c: 2*c - 1).view((len(test_subset), 1)).to(device)

Using cpu
