# 1 Loading the Dataset

In [1]:
import numpy as np
from sklearn.datasets import load_digits
digits = load_digits()
print(digits.keys())
data = digits["data"]
images = digits["images"]
target = digits["target"]
target_names = digits["target_names"]
print(data.shape)

dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])
(1797, 64)


Extract instances showing "3" or "8", append a column of "1s" and create a vector of ground-truth labels where 1 corresponds to 3 and -1 to 8.

In [2]:
X = data[np.logical_or(target == 3, target == 8)]
y = target[np.logical_or(target == 3, target == 8)]

X = np.concatenate((X, np.ones([len(y),1])), axis = 1)

y[y==3] = 1
y[y==8] = -1

# 1.1 Classification with sklearn

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split

In [5]:
num_splits = 10
lambdas = [0.001,0.01,0.1,1,10,100,1000]
scores = np.zeros([len(lambdas), 2]) # save mean score and std for each lambda

for i, C in enumerate(lambdas):
    logistic = LogisticRegression(C = C)
    curr_scores = cross_val_score(logistic, X, y, cv = num_splits)
    scores[i,0] = np.average(curr_scores)
    scores[i,1] = np.std(curr_scores)

In [6]:
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format

In [7]:
display(pd.DataFrame(
        data = scores,
        index = lambdas,
        columns = ['mean','std'])
        .rename_axis('C', axis = 'columns'))

C,mean,std
0.001,0.9688,0.0531
0.01,0.9691,0.0464
0.1,0.986,0.0288
1.0,0.986,0.0288
10.0,0.9803,0.0283
100.0,0.9775,0.0304
1000.0,0.9747,0.0341


The accuracy of the prediction varies only very slowly when modifying the regularization parameter. The best accuracy is obtained for $\lambda=0.1, 1, 10$, thus a very large range. In the following we will proceed using $\lambda=1$.

# 1.2 Optimization Methods

In [4]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [5]:
def predict(beta, X):
    # if X*beta > 0 --> y=1, if X*beta < 0 --> y=-1
    y = (X.dot(beta) >= 0) * 2 - 1
    return y

In [6]:
def zero_one_loss(y_prediction, y_truth):
    return np.sum(np.not_equal(y_prediction, y_truth))

In [7]:
def gradient(beta, X, y, lambda_ = 1):
    # distinguish the cases of one single / several training instances
    if np.isscalar(y):
        grad = beta / lambda_ - sigmoid(-X.dot(beta) * y) * y * X
    else:
        grad = beta / lambda_ - np.average((sigmoid(-X.dot(beta) * y) * y)[:,None] * X, axis = 0)
    return grad

In [8]:
def GD(X, y, beta, tau, m):
    for _ in range(m):
        beta = beta - tau * gradient(beta, X, y)
    return beta

In [45]:
from sklearn.utils import shuffle
def SGD_with_replacement(X, y, beta, tau_0, gamma, m):
    N = y.shape[0]
    for i in range(m):
        tau = tau_0 / (1 + gamma * i)
        index = np.random.randint(low=0, high=N)
        beta = beta - tau * gradient(beta, X[index,:], y[index])
    return beta

def SGD_without_replacement(X, y, beta, tau_0, gamma, m):
    X, y = shuffle(X,y)
    for i in range(m):
        tau = tau_0 / (1 + gamma * i)
        beta = beta - tau * gradient(beta, X[i,:], y[i])
    return beta

In [46]:
def SG_minibatch(X, y, beta, B, tau_0, gamma, m):
    # here we do not replace the samples since there would not be enough samples
    # to perform m=150 optimization steps for larger B
    for i in range(m):
        X, y = shuffle(X,y)
        tau = tau_0 / (1 + gamma * i)
        beta = beta - tau * gradient(beta, X[:B,:], y[:B])
    return beta

In [47]:
def SG_momentum(X, y, beta, tau_0, gamma, mu, m):
    X, y = shuffle(X,y)
    # initialize g
    g = np.zeros(len(beta))
    for i in range(m):
        tau = tau_0 / (1 + gamma * i)
        g = mu * g + (1 - mu) * gradient(beta, X[i,:], y[i])
        beta = beta - tau * g
    return beta

In [48]:
def ADAM(X, y, beta, m, tau=10**-4, mu1=0.9, mu2=0.999, eps=10**-8):
    X, y = shuffle(X,y)
    # initialize g, q
    g = np.zeros(len(beta))
    q = np.zeros(len(beta))
    for i in range(m):
        grad = gradient(beta, X[i,:], y[i])
        g = mu1 * g + (1 - mu1) * grad
        q = mu2 * q + (1 - mu2) * np.square(grad)
        beta = beta - tau / (np.sqrt(q) + eps) * g
    return beta

In [56]:
def SAGD(X, y, beta, m, tau_0, gamma):
    g_stored = -y[:,None] * X * sigmoid(-y * X.dot(beta))[:,None]
    g = np.average(g_stored, axis=0)
    X, y = shuffle(X, y)
    for i in range(m):
        tau = tau_0 / (1 + gamma * i)
        g[i] = -y[i] * np.dot(X[i,:].T, sigmoid(-y[i] * np.dot(X[i,:], beta)))
        g = g + 1 / N * (g[i])


In [49]:
def Newton_Raphson(X, y, beta, m, lambda_=0.001):# hier muss lambda kleiner gewählt werden, für lambda=1 divergiert das ganze
    N,D = X.shape
    for _ in range(m):
        z = X.dot(beta)
        y_tilde = y / sigmoid(-y * z)
        W = np.diag(lambda_ / N * sigmoid(z) * sigmoid(-z))
        inv = np.linalg.inv(1 / lambda_ * np.identity(D) + np.dot(X.T, np.dot(W, X)))
        beta = beta + np.dot(inv, np.dot(X.T, np.dot(W, y_tilde)) - beta / lambda_)
    return beta

In [50]:
def dual_coordinate_ascent(X, y, m, lambda_=0.0001):
    # initialization:
    N = X.shape[0]
    alpha = np.random.uniform(size=N)
    beta = lambda_ * np.average((alpha * y)[:,None] * X, axis=0)
    X, y = shuffle(X, y)
    for i in range(m):
        f_prime = y[i] * np.dot(X[i,:], beta) + np.log(alpha[i] / (1 - alpha[i]))
        f_2prime = lambda_ / N * np.dot(X[i,:], X[i,:].T) + 1 / (alpha[i] * (1 - alpha[i]))
        alpha_old = alpha[i].copy()
        alpha[i] = max(0, min(1, alpha[i] - f_prime / f_2prime))
        beta += lambda_ / N * y[i] * X[i,:] * (alpha[i] - alpha_old)
    return beta

In [51]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3 ,random_state = 0)
beta = dual_coordinate_ascent(X_train, y_train, 150)

errors = zero_one_loss(predict(beta, X_test), y_test)
print(errors / len(y_test))

0.0185185185185


In [18]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3 ,random_state = 0)
beta = np.zeros(65)
beta = GD(X_train, y_train, beta, 0.001, 10)

errors = zero_one_loss(predict(beta, X_test), y_test)
print(errors / len(y_test))

0.0185185185185


In [19]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3 ,random_state = 0)
beta = np.zeros(65)
beta = Newton_Raphson(X_train, y_train, beta, 20)

errors = zero_one_loss(predict(beta, X_test), y_test)
print(errors / len(y_test))

0.0185185185185


In [69]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3 ,random_state = 0)
beta = np.zeros(65)
tau_0 = 0.001
gamma = 0.0001
beta = SGD_without_replacement(X_train, y_train, beta, tau_0, gamma, 150)

errors = zero_one_loss(predict(beta, X_test), y_test)
print(errors / len(y_test))

0.00925925925926


In [53]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3 ,random_state = 0)
beta = np.zeros(65)
tau_0 = 0.001
gamma = 0.0001
beta = SG_minibatch(X_train, y_train, beta, 1, tau_0, gamma, 150)

errors = zero_one_loss(predict(beta, X_test), y_test)
print(errors / len(y_test))

0.0185185185185


In [27]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3 ,random_state = 0)
beta = np.zeros(65)
tau_0 = 0.001
gamma = 0.0001
mu = 0.1
beta = ADAM(X_train, y_train, beta, 150)

errors = zero_one_loss(predict(beta, X_test), y_test)
print(errors / len(y_test))

0.00925925925926


In [57]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.3 ,random_state = 0)
beta = np.zeros(65)
tau_0 = 0.001
gamma = 0.0001
mu = 0.1
beta = SAGD(X_train, y_train, beta, 150)

errors = zero_one_loss(predict(beta, X_test), y_test)
print(errors / len(y_test))

TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

# 1.3 Comparison

In [64]:
from sklearn.cross_validation import KFold
import itertools

In [71]:
X_tr, X_test, y_tr, y_test = train_test_split (X, y, test_size = 0.3 ,random_state = 0)

for (tau_0, mu, gamma) in itertools.product([0.001, 0.01, 0.1], [0.1, 0.2, 0.5], [0.0001, 0.001, 0.01]):
    kf = KFold(y_tr.shape[0], n_folds=10)
    errors = 0
    for train, val in kf:
        beta = SGD_without_replacement(X_tr[train], y_tr[train], beta, tau_0, gamma, 150)
        errors += zero_one_loss(predict(beta, X_tr[val]), y_tr[val])
    print(tau_0, mu, gamma, errors)
        

0.001 0.1 0.0001 3
0.001 0.1 0.001 4
0.001 0.1 0.01 1
0.001 0.2 0.0001 4
0.001 0.2 0.001 1
0.001 0.2 0.01 3
0.001 0.5 0.0001 7
0.001 0.5 0.001 1
0.001 0.5 0.01 1
0.01 0.1 0.0001 36
0.01 0.1 0.001 35
0.01 0.1 0.01 25
0.01 0.2 0.0001 57
0.01 0.2 0.001 26
0.01 0.2 0.01 13
0.01 0.5 0.0001 29
0.01 0.5 0.001 24
0.01 0.5 0.01 9
0.1 0.1 0.0001 85
0.1 0.1 0.001 92
0.1 0.1 0.01 59
0.1 0.2 0.0001 134
0.1 0.2 0.001 105
0.1 0.2 0.01 75
0.1 0.5 0.0001 81
0.1 0.5 0.001 75
0.1 0.5 0.01 55
