In [8]:
import numpy as np
import pandas as pd

# Load data
def load_data(train_path, test_path):
    """Read the training and test CSV files and split each into features and labels.

    Both files are parsed with ``pd.read_csv``. In each one, column 0 is taken
    as the label and every remaining column as a feature; feature values are
    divided by 255.0.

    Args:
        train_path: Location of the training CSV (anything ``pd.read_csv`` accepts).
        test_path: Location of the test CSV (anything ``pd.read_csv`` accepts).

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of NumPy arrays.
    """
    def _features_and_labels(frame):
        # First column holds the label; the rest are scaled features.
        scaled = frame.iloc[:, 1:].values / 255.0
        labels = frame.iloc[:, 0].values
        return scaled, labels

    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)
    X_train, y_train = _features_and_labels(train_frame)
    X_test, y_test = _features_and_labels(test_frame)
    return X_train, y_train, X_test, y_test

# One-hot encoding
def one_hot_encode(y, num_classes):
    """Build a ``(len(y), num_classes)`` float matrix with a single 1 per row.

    Row ``i`` carries its 1 in column ``y[i]``; all other entries are 0.

    Args:
        y: Sequence of integer class indices.
        num_classes: Number of columns in the encoded matrix.

    Returns:
        The one-hot encoded matrix as a NumPy array.
    """
    n_samples = len(y)
    encoded = np.zeros((n_samples, num_classes))
    row_index = np.arange(n_samples)
    encoded[row_index, y] = 1
    return encoded

# Activation functions and derivatives
def relu(x):
    """Rectified linear unit: the element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Element-wise ReLU gradient: 1 where ``x`` is strictly positive, else 0."""
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating so that large
    scores do not overflow; the result of each row sums to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    shifted_exp = np.exp(x - row_max)
    row_total = np.sum(shifted_exp, axis=1, keepdims=True)
    return shifted_exp / row_total

# Initialize weights
def initialize_weights(input_size, hidden_size, output_size):
    """Create the parameters of a two-layer network.

    Weights are drawn from ``np.random.randn`` and scaled by
    ``sqrt(2 / fan_in)`` (He initialization); biases start at zero.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with shapes ``(input_size, hidden_size)``,
        ``(1, hidden_size)``, ``(hidden_size, output_size)`` and
        ``(1, output_size)``.
    """
    # W1 is drawn before W2 so results under a fixed global seed stay stable.
    hidden_scale = np.sqrt(2.0 / input_size)
    W1 = np.random.randn(input_size, hidden_size) * hidden_scale
    output_scale = np.sqrt(2.0 / hidden_size)
    W2 = np.random.randn(hidden_size, output_size) * output_scale

    b1 = np.zeros((1, hidden_size))
    b2 = np.zeros((1, output_size))
    return W1, b1, W2, b2

# Forward propagation
def forward_propagation(X, W1, b1, W2, b2):
    """Run one forward pass through the two-layer network.

    The hidden layer applies ``relu`` to ``X·W1 + b1``; the output layer
    applies ``softmax`` to ``A1·W2 + b2``.

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: hidden pre-activation, hidden activation,
        output pre-activation and output probabilities.
    """
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = relu(hidden_pre)
    output_pre = np.dot(hidden_act, W2) + b2
    output_prob = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_prob

# Compute loss
def compute_loss(A2, Y):
    """Mean cross-entropy of the predicted probabilities against integer labels.

    Args:
        A2: 2-D array of per-class probabilities, one row per sample.
        Y: Sequence of integer class indices, one per row of ``A2``.

    Returns:
        The average of ``-log(A2[i, Y[i]])`` over all samples.
    """
    m = len(Y)
    true_class_probs = A2[np.arange(m), Y]
    # A probability that underflowed to exactly 0 would make log() return -inf
    # and the whole loss infinite; floor it so the penalty stays large but finite.
    safe_probs = np.clip(true_class_probs, 1e-12, None)
    logprobs = -np.log(safe_probs)
    loss = np.sum(logprobs) / m
    return loss

# Backpropagation
def backward_propagation(X, Y, Z1, A1, Z2, A2, W2):
    """Compute parameter gradients for the two-layer network.

    Args:
        X: Input batch fed to the forward pass.
        Y: Integer class indices, one per sample.
        Z1: Hidden-layer pre-activation.
        A1: Hidden-layer activation.
        Z2: Output-layer pre-activation (accepted but not used here).
        A2: Output probabilities.
        W2: Output-layer weights.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)``, each averaged over the ``len(Y)`` samples.
    """
    m = len(Y)

    # Output error: the probabilities with 1 subtracted at each true class.
    output_delta = A2.copy()
    output_delta[np.arange(m), Y] -= 1

    # Push the error back through W2 and gate it by the ReLU derivative.
    hidden_delta = np.dot(output_delta, W2.T) * relu_derivative(Z1)

    dW2 = np.dot(A1.T, output_delta) / m
    db2 = np.sum(output_delta, axis=0, keepdims=True) / m
    dW1 = np.dot(X.T, hidden_delta) / m
    db1 = np.sum(hidden_delta, axis=0, keepdims=True) / m
    return dW1, db1, dW2, db2

# Update parameters
def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    """Apply one gradient-descent step to every parameter.

    Each parameter is decreased by ``learning_rate`` times its gradient using
    augmented assignment, so NumPy arrays passed in are modified in place.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` after the update.
    """
    step_W1 = learning_rate * dW1
    step_b1 = learning_rate * db1
    step_W2 = learning_rate * dW2
    step_b2 = learning_rate * db2

    W1 -= step_W1
    b1 -= step_b1
    W2 -= step_W2
    b2 -= step_b2
    return W1, b1, W2, b2

# Predict function
def predict(X, W1, b1, W2, b2):
    """Return the predicted class index for every row of ``X``.

    Runs a forward pass and picks, per row, the column with the highest
    output probability.
    """
    class_probs = forward_propagation(X, W1, b1, W2, b2)[-1]
    return np.argmax(class_probs, axis=1)

# Neural network model
def neural_network(X_train, y_train, X_test, y_test, hidden_size, num_iterations, learning_rate, print_interval):
    """Train a two-layer network with full-batch gradient descent and score it.

    Args:
        X_train: Training features, one row per sample.
        y_train: Integer training labels.
        X_test: Test features, one row per sample.
        y_test: Integer test labels.
        hidden_size: Number of hidden units.
        num_iterations: Number of gradient-descent steps, each over all of
            ``X_train``.
        learning_rate: Step size passed to ``update_parameters``.
        print_interval: Progress is printed whenever the iteration index is a
            multiple of this value.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)`` as percentages.
    """
    input_size = X_train.shape[1]
    output_size = 10  # number of output classes is fixed at 10
    W1, b1, W2, b2 = initialize_weights(input_size, hidden_size, output_size)
    # The loss and gradients index probabilities by integer label directly,
    # so no one-hot encoded copy of y_train is built here.

    for i in range(num_iterations):
        Z1, A1, Z2, A2 = forward_propagation(X_train, W1, b1, W2, b2)
        loss = compute_loss(A2, y_train)
        dW1, db1, dW2, db2 = backward_propagation(X_train, y_train, Z1, A1, Z2, A2, W2)
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

        if i % print_interval == 0:
            # The reported loss was computed before this iteration's update;
            # the accuracy is measured with the freshly updated parameters.
            train_predictions = predict(X_train, W1, b1, W2, b2)
            train_accuracy = np.mean(train_predictions == y_train) * 100
            print(f'Iteration {i}: Loss = {loss:.4f}, Train Accuracy = {train_accuracy:.2f}%')

    train_predictions = predict(X_train, W1, b1, W2, b2)
    test_predictions = predict(X_test, W1, b1, W2, b2)
    train_accuracy = np.mean(train_predictions == y_train) * 100
    test_accuracy = np.mean(test_predictions == y_test) * 100
    return train_accuracy, test_accuracy

# Dataset locations, resolved relative to the working directory.
train_path, test_path = 'mnist_train.csv', 'mnist_test.csv'

# Training hyperparameters.
hidden_size, num_iterations = 128, 1000
learning_rate, print_interval = 0.01, 100

X_train, y_train, X_test, y_test = load_data(train_path, test_path)

# Train on the full training set, then report the final accuracies.
train_accuracy, test_accuracy = neural_network(
    X_train,
    y_train,
    X_test,
    y_test,
    hidden_size=hidden_size,
    num_iterations=num_iterations,
    learning_rate=learning_rate,
    print_interval=print_interval,
)

print(f'Train Accuracy: {train_accuracy:.2f}%')
print(f'Test Accuracy: {test_accuracy:.2f}%')


Iteration 0: Loss = 2.4432, Train Accuracy = 7.02%
Iteration 100: Loss = 1.6643, Train Accuracy = 62.47%
Iteration 200: Loss = 1.1939, Train Accuracy = 77.49%
Iteration 300: Loss = 0.9195, Train Accuracy = 81.77%
Iteration 400: Loss = 0.7631, Train Accuracy = 83.76%
Iteration 500: Loss = 0.6666, Train Accuracy = 84.94%
Iteration 600: Loss = 0.6020, Train Accuracy = 85.84%
Iteration 700: Loss = 0.5560, Train Accuracy = 86.52%
Iteration 800: Loss = 0.5215, Train Accuracy = 87.06%
Iteration 900: Loss = 0.4945, Train Accuracy = 87.46%
Train Accuracy: 87.81%
Test Accuracy: 88.43%
