In [1]:
from sklearn.datasets import fetch_openml
import numpy as np
# Seed NumPy's global RNG so that the np.random.randn weight initialisation
# used by DenseLayer is reproducible (given the same execution order).
np.random.seed(42)

In [8]:
def relu(x):
    """Rectified linear unit, applied element-wise.

    Args:
        x: Scalar or array-like of numbers.

    Returns:
        ``x`` with every negative entry replaced by 0 (same shape as ``x``).
    """
    rectified = np.maximum(0, x)
    return rectified


def relu_derivative(x):
    """Element-wise derivative of ReLU evaluated at ``x``.

    The result is 1.0 wherever ``x > 0`` and 0.0 everywhere else (so the
    derivative at exactly 0 is taken to be 0).

    Args:
        x: A NumPy array, or anything ``np.asarray`` accepts (a Python
            number, a nested list, ...).

    Returns:
        Floating-point 0.0/1.0 values with the same shape as ``x``.
    """
    # Going through np.asarray lets plain Python numbers and lists in: a bare
    # ``x > 0`` on those yields a built-in bool (no ``.astype``) or raises.
    return (np.asarray(x) > 0).astype(float)


class DenseLayer:
    """Fully connected layer computing ``input . weights + biases``.

    Attributes:
        weights: Array of shape (input_size, output_size).
        biases: Array of shape (1, output_size).
        input: The argument of the most recent ``forward`` call; ``backward``
            reads it to form the weight gradient.
    """

    def __init__(self, input_size, output_size):
        """Initialise weights with small Gaussian noise and biases with zeros.

        Args:
            input_size: Number of input features.
            output_size: Number of output units.
        """
        shape = (input_size, output_size)
        # Standard-normal samples scaled down by a factor of 100.
        self.weights = 0.01 * np.random.randn(*shape)
        self.biases = np.zeros((1, output_size))

    def forward(self, input):
        """Apply the affine map and remember ``input`` for the backward pass.

        Args:
            input: Array whose last dimension equals ``input_size``.

        Returns:
            ``np.dot(input, weights) + biases``.
        """
        self.input = input
        projected = np.dot(input, self.weights)
        return projected + self.biases

    def backward(self, output_gradient, learning_rate):
        """Back-propagate through the layer and take one gradient-descent step.

        Args:
            output_gradient: Gradient of the loss with respect to this layer's
                output (as returned by ``forward``).
            learning_rate: Step size applied to both weights and biases.

        Returns:
            Gradient of the loss with respect to the layer's input.
        """
        grad_weights = np.dot(self.input.T, output_gradient)
        grad_biases = np.sum(output_gradient, axis=0, keepdims=True)
        # The input gradient has to be formed from the weights as they were
        # during the forward pass, i.e. before the in-place update below.
        grad_input = np.dot(output_gradient, self.weights.T)

        self.weights -= learning_rate * grad_weights
        self.biases -= learning_rate * grad_biases
        return grad_input


def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Args:
        logits: Array of shape (n_rows, n_scores).

    Returns:
        Array of the same shape in which every row is non-negative and sums
        to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large scores.
    row_max = np.max(logits, axis=1, keepdims=True)
    exps = np.exp(logits - row_max)
    normaliser = np.sum(exps, axis=1, keepdims=True)
    return exps / normaliser

In [11]:
# Load the MNIST data: fetch the 'mnist_784' dataset (version 1) and pull out
# its "data" and "target" entries as features X and labels y.
mnist = fetch_openml('mnist_784', version=1)
X = mnist["data"]
y = mnist["target"]

In [10]:
class MultiLayerNetwork:
    """Feed-forward classifier built from a stack of ``DenseLayer`` objects.

    Every hidden layer is followed by ReLU. The final layer is left linear so
    that it emits raw logits; ``softmax`` is applied to those logits when the
    loss and its gradient are computed. (Putting ReLU on the output layer
    would clip negative logits and zero their gradient, so an output unit
    whose pre-activation is <= 0 could never be pushed up.)

    Attributes:
        layers: The ``DenseLayer`` instances, input side first.
        activations: Set by ``forward``: the input followed by the output of
            every layer (post-ReLU for hidden layers, raw logits for the last).
    """

    def __init__(self, layer_sizes):
        """Create one ``DenseLayer`` per consecutive pair of sizes.

        Args:
            layer_sizes: Sequence ``[n_inputs, hidden_1, ..., n_classes]``.
        """
        self.layers = [
            DenseLayer(n_in, n_out)
            for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])
        ]

    def forward(self, input_data):
        """Run a forward pass and cache the activations for ``backward``.

        Args:
            input_data: Array of shape (n_samples, n_inputs).

        Returns:
            The logits of the last layer, shape (n_samples, n_classes).
        """
        self.activations = [input_data]
        x = input_data
        last_index = len(self.layers) - 1
        for index, layer in enumerate(self.layers):
            x = layer.forward(x)
            if index != last_index:
                # Hidden layers only; the output layer stays linear.
                x = relu(x)
            self.activations.append(x)
        return x

    def backward(self, y_true, learning_rate):
        """Back-propagate softmax cross-entropy and update every layer.

        Must be called after ``forward``; it uses the activations cached there.

        Args:
            y_true: Integer class label(s) for the samples of the last
                ``forward`` call; a scalar or a 1-D array-like.
            learning_rate: Step size passed on to each layer's ``backward``.

        Raises:
            ValueError: If the number of labels differs from the number of
                samples in the cached forward pass.
        """
        # Accept a bare scalar as well as any array-like of labels.
        labels = np.asarray(y_true).reshape(-1).astype(int)

        logits = self.activations[-1]
        n_samples, n_classes = logits.shape
        if labels.size != n_samples:
            # Without this check a (1, C) one-hot matrix would silently
            # broadcast against an (N, C) prediction matrix.
            raise ValueError(
                f"got {labels.size} label(s) for {n_samples} sample(s)")

        one_hot = np.zeros((n_samples, n_classes))
        one_hot[np.arange(n_samples), labels] = 1

        # Gradient of the mean cross-entropy with respect to the logits.
        gradient = (softmax(logits) - one_hot) / n_samples

        last_index = len(self.layers) - 1
        for index in range(last_index, -1, -1):
            if index != last_index:
                # activations[index + 1] is relu(z), and relu(z) > 0 exactly
                # where z > 0, so it can stand in for the pre-activation.
                gradient = relu_derivative(
                    self.activations[index + 1]) * gradient
            gradient = self.layers[index].backward(gradient, learning_rate)

    def compute_loss(self, x_train, y_train):
        """Mean cross-entropy of the network over a data set.

        Each sample is pushed through ``forward`` on its own, so the cached
        activations are overwritten as a side effect.

        Args:
            x_train: Iterable of 1-D feature arrays (e.g. a 2-D array).
            y_train: Iterable of integer class labels, aligned with
                ``x_train``.

        Returns:
            The average of ``-log p(true class)`` over all samples.
        """
        eps = 1e-12  # floor for the probability so log() never sees 0
        total = 0.0
        for x, y in zip(x_train, y_train):
            probs = softmax(self.forward(x.reshape(1, -1)))
            # Cross-entropy against a one-hot target reduces to the negative
            # log-probability assigned to the true class.
            p_true = max(float(probs[0, int(y)]), eps)
            total -= np.log(p_true)
        return total / len(x_train)

    def train(self, x_train, y_train, learning_rate, epochs):
        """Train with per-sample stochastic gradient descent.

        After every epoch the loss over the full training set is printed.

        Args:
            x_train: 2-D array of training features, one row per sample.
            y_train: Integer class labels aligned with ``x_train``.
            learning_rate: Step size for every update.
            epochs: Number of passes over the training data.
        """
        for epoch in range(epochs):
            for x, y in zip(x_train, y_train):
                # Each update uses a single sample shaped [1, num_features].
                self.forward(x.reshape(1, -1))
                self.backward(y, learning_rate)
            print(
                f'Epoch {epoch}, Loss: {self.compute_loss(x_train, y_train)}')

In [13]:
# Network dimensions: one input per feature column of X, ten output classes.
input_size = X.shape[1]
output_size = 10

# Five hidden layers of 1024 units each, followed by the output layer.
# NOTE(review): the original comment said six hidden layers, but the list has
# always contained five -- confirm the intended depth.
layer_sizes = [input_size] + [1024] * 5 + [output_size]

# Prepare the data as plain arrays. `.values` is used on both X and y, so they
# are presumably pandas objects -- TODO confirm. Labels are cast to int
# because the network uses them as class indices.
X_np = X.values
y_np = y.values.astype(int)

network = MultiLayerNetwork(layer_sizes)
network.train(X_np, y_np, learning_rate=0.01, epochs=1)