# 1 Loading the Dataset

In [4]:
import numpy as np
from sklearn.datasets import load_digits
# Load scikit-learn's bundled digits dataset and list the entries it provides.
digits = load_digits()
print(digits.keys())

# Unpack the entries into module-level names for the cells below.
data, images = digits["data"], digits["images"]
target, target_names = digits["target"], digits["target_names"]

# Sanity check: (number of samples, number of features).
print(data.shape)

dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])
(1797, 64)


Extract instances showing "3" or "8", append a column of "1s" and create a vector of ground-truth labels where 1 corresponds to 3 and -1 to 8.

In [79]:
# Select the rows whose label is 3 or 8 (mask computed once and reused).
is_3_or_8 = (target == 3) | (target == 8)

# Ground-truth labels: +1 for digit 3, -1 for digit 8 (same integer dtype as `target`).
y = np.where(target[is_3_or_8] == 3, 1, -1).astype(target.dtype)

# Feature matrix with a trailing column of ones appended.
X = np.hstack((data[is_3_or_8], np.ones((len(y), 1))))

# 1.1 Classification with sklearn

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split

In [49]:
# Cross-validate LogisticRegression once per candidate value and record
# the mean and standard deviation of the fold scores.
num_splits = 10
# NOTE: these values are passed to scikit-learn as `C`, which is the *inverse*
# regularization strength (larger C = weaker regularization).
lambdas = [0.001,0.01,0.1,1,10,100,1000]
scores = np.zeros((len(lambdas), 2))  # column 0: mean score, column 1: std

for i, C in enumerate(lambdas):
    logistic = LogisticRegression(C = C)
    curr_scores = cross_val_score(logistic, X, y, cv = num_splits)
    scores[i] = np.average(curr_scores), np.std(curr_scores)

In [47]:
import pandas as pd
# Render floats in displayed tables with thousands separators and four decimals.
pd.set_option('display.float_format', '{:,.4f}'.format)

In [50]:
# One row per tested C value; the column axis is labelled 'C' so the
# rendered header reads "C | mean | std".
score_table = pd.DataFrame(
    scores,
    index = lambdas,
    columns = pd.Index(['mean', 'std'], name = 'C'))
display(score_table)

C,mean,std
0.001,0.9688,0.0531
0.01,0.9691,0.0464
0.1,0.986,0.0288
1.0,0.986,0.0288
10.0,0.9803,0.0283
100.0,0.9775,0.0304
1000.0,0.9747,0.0341


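The accuracy of the prediction varies only slowly when the regularization parameter is modified. The best accuracy (0.986) is obtained for $C=0.1$ and $C=1$, and it stays within about two percentage points of that over the whole tested range. Note that the parameter `C` of scikit-learn's `LogisticRegression` is the inverse regularization strength, i.e. $C = 1/\lambda$. In the following we will proceed using $\lambda=1$ (equivalently $C=1$).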

# 1.2 Optimization Methods

In [51]:
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    Parameters
    ----------
    z : scalar or array-like
        Input value(s). Integer and boolean input is promoted to float.

    Returns
    -------
    NumPy scalar or ndarray with the same shape as ``z``, with values in [0, 1].

    Notes
    -----
    The naive form overflows in ``np.exp(-z)`` for large negative ``z``.
    Here the exponential is only ever taken of a non-positive number:
        z >= 0:  1 / (1 + exp(-|z|))
        z <  0:  exp(-|z|) / (1 + exp(-|z|))
    Both are algebraically equal to 1 / (1 + exp(-z)).
    """
    z = np.asarray(z)
    if z.dtype.kind in 'biu':
        # bool / signed / unsigned integers -> float before exponentiating
        z = z.astype(float)
    decay = np.exp(-np.abs(z))  # always in (0, 1], so it cannot overflow
    return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

In [64]:
def predict(beta, X):
    """Classify each row of ``X`` by the sign of its linear score ``X.dot(beta)``.

    Returns an integer array holding +1 where the score is >= 0 (a score of
    exactly zero therefore maps to +1) and -1 where the score is negative.
    """
    linear_scores = X.dot(beta)
    return np.where(linear_scores >= 0, 1, -1)

In [87]:
def zero_one_loss(y_prediction, y_truth):
    """Return the number of positions where prediction and ground truth differ.

    This is an absolute count, not a rate; divide by the number of samples
    to obtain an error rate.
    """
    mismatches = np.not_equal(y_prediction, y_truth)
    return mismatches.sum()

In [70]:
def gradient(beta, X, y, lambda_ = 1):
    """Gradient of the regularized logistic loss with respect to ``beta``.

    Computes ``beta / lambda_`` minus the average over the rows of ``X`` of
    ``sigmoid(-y_i * x_i.dot(beta)) * y_i * x_i``.

    Parameters
    ----------
    beta : 1-D array of weights, one per column of ``X``.
    X : 2-D array with one instance per row.
    y : 1-D array with one label per row of ``X``.
    lambda_ : scalar dividing the ``beta`` term (default 1).
    """
    margins = X.dot(beta) * y
    # Per-instance weight sigmoid(-margin) * y, broadcast across the feature columns.
    per_instance = (sigmoid(-margins) * y)[:, None] * X
    data_term = np.average(per_instance, axis = 0)
    return beta / lambda_ - data_term

In [71]:
def GD(X, y, beta, tau, m):
    """Run ``m`` steps of gradient descent with constant step size ``tau``.

    Starts from ``beta`` and repeatedly subtracts ``tau`` times the gradient
    evaluated on the full data ``(X, y)`` (with ``gradient``'s default
    ``lambda_``). The input ``beta`` is not modified; the final iterate is
    returned.
    """
    estimate = beta
    for _step in range(m):
        update = tau * gradient(estimate, X, y)
        estimate = estimate - update
    return estimate

In [None]:
def SGD(X, y, beta, tau, m, random_state = None):
    """Run ``m`` steps of stochastic gradient descent with constant step size ``tau``.

    Mirrors ``GD``, except that each step evaluates ``gradient`` on a single
    instance drawn uniformly at random (with replacement) instead of on the
    full data set.

    NOTE(review): the original cell contained only the incomplete statement
    ``def SGD``; this signature and the constant step size are assumptions
    modelled on ``GD`` -- adjust if a different schedule was intended.

    Parameters
    ----------
    X : 2-D array with one instance per row.
    y : 1-D array with one label per row of ``X``.
    beta : 1-D array, starting point (not modified).
    tau : step size.
    m : number of update steps.
    random_state : seed passed to ``np.random.RandomState`` for reproducible sampling.

    Returns
    -------
    The weight vector after ``m`` updates.
    """
    rng = np.random.RandomState(random_state)
    num_instances = X.shape[0]
    for _ in range(m):
        i = rng.randint(num_instances)
        # Slicing (rather than indexing) keeps X 2-D and y 1-D, as `gradient` expects.
        beta = beta - tau * gradient(beta, X[i:i + 1], y[i:i + 1])
    return beta

In [89]:
# Hold out 30% of the instances for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Gradient-descent settings, named so they can be tuned in one place.
gd_step_size = 0.0001
gd_num_steps = 10

# Start from the zero vector; its length follows the number of feature
# columns instead of being hard-coded.
beta_init = np.zeros(X_train.shape[1])
beta = GD(X_train, y_train, beta_init, gd_step_size, gd_num_steps)

# Report the fraction of misclassified test instances.
errors = zero_one_loss(predict(beta, X_test), y_test)
print(errors / len(y_test))

0.0185185185185
