In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

from tqdm import tqdm #optional, if you do not want to import remove tqdm() from loops!

%load_ext autoreload
%autoreload 2
%matplotlib inline

# 1. Preprocessing

We will again be using the MNIST dataset. This time I prepared the dataset as npy files. We will load the data, visualize an example, and then implement logistic regression.

In [None]:
# Location of the feature array (edit this if the file is in a different directory).
f_features = "features.npy"

# Location of the label array (edit this if the file is in a different directory).
f_labels = "labels.npy"

# Read both arrays from disk.
features = np.load(f_features)
labels = np.load(f_labels)

In [None]:
# Display the first sample of the feature array as a grayscale image.
plt.imshow(features[0, :, :], cmap="gray")

In [None]:
# Sequential split into training (samples 0-4799), validation (4800-5399) and
# test (5400-5999) sets. Every image is flattened to a vector of 784 features
# and the labels are cast to integers.
x_train = features[:4800, :, :].reshape((4800, 784))
y_train = labels[:4800].astype(int)

x_val = features[4800:5400, :, :].reshape((600, 784))
y_val = labels[4800:5400].astype(int)

x_test = features[5400:6000, :, :].reshape((600, 784))
y_test = labels[5400:6000].astype(int)

# Note: Normally the split has to be random and stratified for the validation set and random for the test set

In [None]:
# Restore the 28x28 image layout of the first flattened training sample and display it.
plt.imshow(np.reshape(x_train[0], (28, 28)), cmap="gray")

# 2. Logistic Regression

From the lecture we know that logistic regression is given by an affine transformation of the data followed by applying the sigmoid function. Our first step is to implement the functions we need.

In [None]:
def layer(x, w):
    """
    Affine transformation of the features (the linear part of logistic regression).

    Parameters
    ----------
    x : feature tensor of dimension (N,M)
    w : learnable parameters of dimension (M+1, C)

    N is the number of samples, M the number of features and C the number of classes.

    Returns
    -------
    res : output tensor of dimension (N, C)

    res is the result of the matrix multiplication of the expanded feature tensor
    (one additional column of ones) with the learnable parameters. The column of ones
    is appended as the LAST column, so w[:-1] holds the feature weights and w[-1]
    holds the bias of every class.

    Raises
    ------
    ValueError : if x or w is not two-dimensional or w does not have M+1 rows.

    """
    # Work in floating point so that integer-typed image data cannot overflow.
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)

    if x.ndim != 2 or w.ndim != 2 or w.shape[0] != x.shape[1] + 1:
        raise ValueError(
            "expected x of shape (N, M) and w of shape (M+1, C), got "
            f"{x.shape} and {w.shape}"
        )

    # Appending a constant 1 to every sample lets the bias be learned as the
    # last row of w, which turns the affine map into one matrix product.
    ones = np.ones((x.shape[0], 1))
    x_expanded = np.hstack((x, ones))

    res = x_expanded @ w

    return res

In [None]:
# Smoke test for layer(): feed the training features through an all-ones
# parameter matrix and inspect the shape of the output.

w = np.ones((28 * 28 + 1, 10))  # 28*28 feature rows plus one bias row, 10 columns
res = layer(x_train, w)
res.shape

In [None]:
# If you need more functions, use the blocks below

## 2.1 Loss Function
In exercise sheet 0, we just guessed values, but now we are smarter! First we need to define an appropriate loss function. The dataset has ten target classes, so we want to implement cross-entropy loss:

$\mathcal{L}=\sum_{y}1\{\hat{y}=y\}(-\log[p(y)])$ 

The $p(y)$ is given by the softmax function

$p(y_i)=\frac{e^{x_i}}{\sum_j e^{x_j}}$

So the softmax should return a vector representing the probability of each class.

In [None]:
def softmax(y):
    """
    Row-wise softmax transformation.

    Parameters
    ----------
    y : Prediction tensor of dimension (N, C).

    Returns
    -------
    res : Softmax transformed tensor of dimension (N,C)

    res is the result of the softmax transformation of y: every entry is
    non-negative and every row sums to 1, so a row can be read as the class
    probabilities of one sample.

    """
    y = np.asarray(y, dtype=float)

    # Softmax is invariant to adding a constant to a row. Subtracting the row
    # maximum makes the largest exponent 0, so np.exp cannot overflow.
    shifted = y - np.max(y, axis=-1, keepdims=True)
    exponentials = np.exp(shifted)

    res = exponentials / np.sum(exponentials, axis=-1, keepdims=True)
    return res

In [None]:
def model(x, w):
    """
    Classify every sample with the class of highest predicted score.

    Parameters
    ----------
    x : feature tensor of dimension (N,M)
    w : learnable parameters of dimension (M+1, C)

    N is the number of samples, M the number of features and C the number of classes.
    The last row of w is interpreted as the bias, w[:-1] as the feature weights.

    Returns
    -------
    res : Prediction tensor of dimension (N,1)

    res is the classification of our model: res[i, 0] is the index of the most
    probable class of sample i.

    Raises
    ------
    ValueError : if x or w is not two-dimensional or w does not have M+1 rows.

    """
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)

    if x.ndim != 2 or w.ndim != 2 or w.shape[0] != x.shape[1] + 1:
        raise ValueError(
            "expected x of shape (N, M) and w of shape (M+1, C), got "
            f"{x.shape} and {w.shape}"
        )

    # Affine scores: feature weights plus the bias row broadcast over all samples.
    scores = x @ w[:-1] + w[-1]

    # Softmax is strictly increasing within a row, so the largest score is also
    # the largest probability; the probabilities themselves are not needed here.
    res = np.argmax(scores, axis=1).reshape(-1, 1)

    return res

In [None]:
# If you need more functions, use the blocks below

## 2.2 Optimization
We have already learned about optimization algorithms. In this exercise we want to learn more about gradient descent, stochastic gradient descent and Newton’s method.

### 2.2.1 Gradient Descent
For gradient descent we need to update our parameters by stepping in the direction of steepest descent, i.e. along the negative gradient of the loss with respect to the parameters. It is given by the equation:

$w_{n+1}=w_n-\epsilon_n\nabla\mathcal{L}(w_n)$

$\epsilon_n$ is the learning rate and a hyperparameter of our optimization approach. We can calculate the gradient by using the composition rule for derivatives.

The challenge is to broadcast the right dimensions!

In [None]:
def classTensor(y, C):
    """
    One-hot encode a vector of class labels.

    Parameters
    ----------
    y : class vector of dimension N containing the true classes
    C : number of classes


    Returns
    -------
    res : class tensor of dimension (N,C)

    The vector is transformed into a binary tensor. If res_ij=1 then it means that at sample i we have class j. Otherwise
    res_ij=0.

    Raises
    ------
    ValueError : if a label lies outside the range 0, ..., C-1.

    """
    labels = np.asarray(y).astype(int).ravel()
    n_classes = int(C)

    # Negative labels would silently wrap around under NumPy indexing, so the
    # valid range is checked explicitly.
    if labels.size and (labels.min() < 0 or labels.max() >= n_classes):
        raise ValueError(f"class labels must lie in [0, {n_classes - 1}]")

    res = np.zeros((labels.shape[0], n_classes))
    # Row i gets a single 1 in the column given by the label of sample i.
    res[np.arange(labels.shape[0]), labels] = 1

    return res

In [None]:
def gradientDescent(x, y, w, learningRate):
    """
    Perform one batch gradient descent step on the softmax cross-entropy loss.

    Parameters
    ----------
    x : feature tensor of dimension (N,M)
    y : class vector of dimension N containing the true classes
    w : learnable parameters of dimension (M+1, C)
    learningRate : step size of the update

    N is the number of samples, M the number of features and C the number of classes.
    The last row of w is interpreted as the bias, w[:-1] as the feature weights.

    Returns
    -------
    res : updated learnable parameters of dimension (M+1, C)

    The update is w - learningRate * gradient, where the gradient belongs to the
    cross-entropy loss averaged over the N samples. Averaging keeps the step size
    independent of the number of samples. The input w is not modified.

    Raises
    ------
    ValueError : if the shapes of x, y and w do not match or a label lies
                 outside the range 0, ..., C-1.

    """
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)
    labels = np.asarray(y).astype(int).ravel()

    if x.ndim != 2 or w.ndim != 2 or w.shape[0] != x.shape[1] + 1:
        raise ValueError(
            "expected x of shape (N, M) and w of shape (M+1, C), got "
            f"{x.shape} and {w.shape}"
        )

    n_samples, n_classes = x.shape[0], w.shape[1]

    if labels.shape[0] != n_samples:
        raise ValueError("x and y must contain the same number of samples")
    if labels.size and (labels.min() < 0 or labels.max() >= n_classes):
        raise ValueError(f"class labels must lie in [0, {n_classes - 1}]")

    # Without samples there is no gradient; return an unchanged copy instead
    # of dividing by zero below.
    if n_samples == 0:
        return w.copy()

    # Affine transformation, with the bias handled by an appended column of ones.
    # Everything is computed inline so that this function is self-contained.
    x_expanded = np.hstack((x, np.ones((n_samples, 1))))
    scores = x_expanded @ w

    # Numerically stable softmax: shifting by the row maximum prevents overflow.
    scores -= scores.max(axis=1, keepdims=True)
    probabilities = np.exp(scores)
    probabilities /= probabilities.sum(axis=1, keepdims=True)

    # Derivative of the cross-entropy with respect to the scores is
    # (probabilities - one-hot targets); subtracting 1 at the true class
    # applies the one-hot part without building the full tensor.
    probabilities[np.arange(n_samples), labels] -= 1.0

    # Chain rule through the affine map, averaged over the samples.
    gradient = x_expanded.T @ probabilities / n_samples

    res = w - learningRate * gradient

    return res

In [None]:
# If you need more functions, use the blocks below

### 2.2.2 Stochastic Gradient Descent
In this section implement stochastic gradient descent by writing the function "def stochasticGradientDescent(...)"

In [None]:
# TODO
# Write the function stochasticGradientDescent

### 2.2.3 Newton’s method
In this section implement Newton's method by writing the function "def newtonMethod(...)"

In [None]:
# TODO
# Write the function newtonMethod

# 3. Training
Now use the MNIST dataset to train a classifier and compare the results.

## 3.1 Train and Plot Gradient Descent

In [None]:
# Hyperparameters of the gradient descent run.
n_epochs = 100
learningRate = 0.001

# Accuracy after every epoch, on the training and on the validation set.
acc_train = []
acc_val = []

# Maybe you find a better way to initialize the learnable parameters
w = np.ones((785, 10))

for e in tqdm(range(n_epochs)):
    # One full-batch parameter update per epoch.
    w = gradientDescent(x_train, y_train, w, learningRate)

    # Evaluate the updated parameters on both sets.
    pred = model(x_train, w)
    acc_train.append(accuracy_score(y_train, pred))
    acc_val.append(accuracy_score(y_val, model(x_val, w)))

In [None]:
# Accuracy per epoch for the validation and the training set.
fig, ax = plt.subplots(figsize=(6, 5))

ax.set(xlabel="Epochs", ylabel="Accuracy")

ax.plot(acc_val, color="orange", label="val", linewidth=1.5)
ax.plot(acc_train, color="blue", label="train", linewidth=1.5)
ax.grid(color="gray", linestyle="--", alpha=0.3)
ax.legend(loc="lower right", fontsize=11)

## 3.2 TODO: Implement training for stochastic gradient descent

## 3.3 TODO: Implement training for newton's method