In [12]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
from scipy.special import xlogy

## Forward-Propagation

$$
X \rightarrow Z=W_1X \rightarrow U = W_2Z \rightarrow S=F_{softmax}(U) \rightarrow L(S, y) = \log S_y,
$$

where $S_y=\frac{\exp(U_y)}{\sum_{j=0}^{K-1}\exp(U_j)}$ is the y-th element of the $S$ and $U_y$ is the y-th element of the $U$.

## Backward-Propagation

$$
\frac{\partial L}{\partial U_t} =  
\begin{cases}
S_t(U), & t\neq y \\
1- S_t(U). & t = y
\end{cases}
\Longrightarrow
\frac{\partial L}{\partial U} = e_y - S(U), 
$$

where $e_y$ is the unit vector, which y-th coordinate equals to 1 and 0 elsewhere. 

\begin{align}
& \frac{\partial L}{\partial W_2} = \frac{\partial L}{\partial U}\frac{\partial U}{\partial W_1} = (e_y - S(U))Z^T \\
& \frac{\partial L}{\partial W_1} =  \big(\frac{\partial L}{\partial W_2} \cdot Z\big)X^T
\end{align}

In [35]:
class DNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, hidden_layer_sizes=(100), solver='sgd',
                 batch_size=1, learning_rate=0.001, momentum=0.9, eps=1e-8,
                 max_iter=200, random_state=32, verbose=False):
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.hidden_layer_sizes = hidden_layer_sizes
        self.momentum = momentum
        self.eps = eps
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.verbose = verbose

    def __stable_softmax(self, X):
        tmp = X - X.max(axis=1)[:, np.newaxis]
        np.exp(tmp, out=X)
        X /= X.sum(axis=1)[:, np.newaxis]
        return X

    def __crossentropy_loss(self, y_true, y_prob):
        mask = y_prob > 0
        div = 1.0/float(y_prob.shape[0])
        y_true = y_true[mask]
        y_prob = y_prob[mask]
        return - xlogy(y_true, y_prob).sum() * div

    def __forward_layer(self, x, w, activation_function):
        out = np.dot(x, w)
        if activation_function is not None:
            out = activation_function(out)
        return out

    def __forward_propagate(self, x):
        network = self.weights
        activations = [x]
        for in_layer, activataion in zip(network, self.functions):
            out = self.__forward_layer(activations[-1], in_layer, activataion)
            activations.append(out)
        return activations

    def __back_propagation(self, activations, y):
        network = self.weights
        coef_grads = [np.empty_like(a_layer) for a_layer in network]

        for i in range(len(network)-1, -1, -1):
            deltas = activations[-1] - y if i == len(network)-1 else np.dot(deltas, network[i + 1].T)
            coef_grads[i] = np.dot(activations[i].T, deltas)

        return coef_grads

    def _init_layer(self, input_size, output_size):
        w = np.random.randn(input_size, output_size)
        return w

    def fit(self, X, y, shuffle=False):
        np.random.seed(self.random_state)

        self._label_binarizer = LabelBinarizer()
        y_train = y
        X_train = X
        y = self._label_binarizer.fit_transform(y)
        self._num_classes = len(self._label_binarizer.classes_)

        n, p = X.shape
        s = self.hidden_layer_sizes[0]

        weights = [
            self._init_layer(p, s),
            self._init_layer(s, self._num_classes)
        ]

        self.functions = [
            None,
            self.__stable_softmax,
        ]
        self.weights = weights
        accum_grad = [np.zeros_like(param) for param in weights]

        for j in range(self.max_iter):
            accumulated_loss = 0.0

            if shuffle:
                indices = np.arange(n)
                np.random.shuffle(indices)
                X = X.take(indices, axis=0)
                y = y.take(indices, axis=0)
            
            for i in range(0, n, self.batch_size):
                X_batch = X[i : i + self.batch_size]
                y_batch = y[i : i + self.batch_size]

                activations = self.__forward_propagate(X_batch)

                y_prob = activations[-1]

                accumulated_loss += self.__crossentropy_loss(y_batch, y_prob)
                coef_grads = self.__back_propagation(activations, y_batch)

                # update weights (adagrad method)
                coef_grads = [grad / self.batch_size for grad in coef_grads]
                accum_grad = [accum + grad**2 for accum, grad in zip(accum_grad, coef_grads)]
                inv_accum_grad = [self.learning_rate / np.sqrt(self.eps + accum) for accum in accum_grad]
                self.weights = [weight - inv_accum * grad for weight, inv_accum, grad in zip(self.weights, inv_accum_grad, coef_grads)]

            if self.verbose:
                loss = accumulated_loss / X.shape[0]
                y_pred = self.predict(X_train)
                accuracy = (y_pred == y_train).mean()
                print("Epoch {}/{};\t Train accuracy: {:.3f} \t Loss : {:.3f}".format(j + 1, self.max_iter, accuracy, loss))

        return self

    def predict(self, X):
        activations = self.__forward_propagate(X)
        y_pred = activations[-1]
        return self._label_binarizer.inverse_transform(y_pred)


## Read datasets

In [23]:
from sklearn.datasets import fetch_mldata

data_train = pd.read_csv("../dataset/mldata/mnist_train.csv", header=None)
data_test = pd.read_csv("../dataset/mldata/mnist_test.csv", header=None)

x_train = np.ascontiguousarray(data_train[data_train.columns[:-1]].values, dtype=np.float32)
y_train = np.ascontiguousarray(data_train[data_train.columns[-1]].values, dtype=np.float32)
x_test = np.ascontiguousarray(data_test[data_test.columns[:-1]].values, dtype=np.float32)
y_test = np.ascontiguousarray(data_test[data_test.columns[-1]].values, dtype=np.float32)

print('train size: ', x_train.shape, y_train.shape)
print('test size: ', x_test.shape, y_test.shape)

train size:  (60000, 784) (60000,)
test size:  (10000, 784) (10000,)


## Grid search parameters for Neural Network

In [24]:
from sklearn.model_selection import GridSearchCV

In [53]:
parameters = {
    'hidden_layer_sizes': [(64,), (128,), (256,), (512,), (1024,), (2048,)],
    'learning_rate': [0.5, 0.1, 0.05],
    'batch_size': [256, 512, 1024]
}
estimator = DNNClassifier(hidden_layer_sizes=(32, ), solver='adagrad',
     batch_size=256, learning_rate=0.1, max_iter=20,
     random_state=777, verbose=False)

clf = GridSearchCV(estimator, parameters, cv=5, scoring='accuracy')
clf.fit(x_train, y_train)
print("Лучший подбор параметра для DNNClassifier: {}".format(clf.best_params_))
print("Лучший scope для DNNClassifier: {}".format(clf.best_score_))

KeyboardInterrupt: 

In [49]:
best_estimator = clf.best_estimator_
up_params = {'verbose': True, 'max_iter': 30}
best_estimator.set_params(**up_params)
best_estimator

DNNClassifier(batch_size=256, eps=1e-08, hidden_layer_sizes=(64,),
       learning_rate=0.5, max_iter=50, momentum=0.9, random_state=777,
       solver=None, verbose=True)

In [50]:
best_estimator.fit(x_train, y_train)

Epoch 1/50;	 Train accuracy: 0.903 	 Loss : 0.008
Epoch 2/50;	 Train accuracy: 0.908 	 Loss : 0.014
Epoch 3/50;	 Train accuracy: 0.896 	 Loss : 0.022
Epoch 4/50;	 Train accuracy: 0.908 	 Loss : 0.030
Epoch 5/50;	 Train accuracy: 0.907 	 Loss : 0.034
Epoch 6/50;	 Train accuracy: 0.908 	 Loss : 0.039
Epoch 7/50;	 Train accuracy: 0.908 	 Loss : 0.042
Epoch 8/50;	 Train accuracy: 0.907 	 Loss : 0.049
Epoch 9/50;	 Train accuracy: 0.904 	 Loss : 0.051
Epoch 10/50;	 Train accuracy: 0.910 	 Loss : 0.056
Epoch 11/50;	 Train accuracy: 0.909 	 Loss : 0.059
Epoch 12/50;	 Train accuracy: 0.908 	 Loss : 0.062
Epoch 13/50;	 Train accuracy: 0.907 	 Loss : 0.067
Epoch 14/50;	 Train accuracy: 0.908 	 Loss : 0.068
Epoch 15/50;	 Train accuracy: 0.905 	 Loss : 0.071
Epoch 16/50;	 Train accuracy: 0.905 	 Loss : 0.073
Epoch 17/50;	 Train accuracy: 0.904 	 Loss : 0.075
Epoch 18/50;	 Train accuracy: 0.901 	 Loss : 0.076
Epoch 19/50;	 Train accuracy: 0.905 	 Loss : 0.078
Epoch 20/50;	 Train accuracy: 0.902 	 Lo

DNNClassifier(batch_size=256, eps=1e-08, hidden_layer_sizes=(64,),
       learning_rate=0.5, max_iter=50, momentum=0.9, random_state=777,
       solver=None, verbose=True)

## Accuracy for testing datasets

In [51]:
y_pred = best_estimator.predict(x_test)
print((y_pred == y_test).mean())

0.8812
