In [1]:
import pickle
import os
import pandas as pd
import numpy as np

# Seed NumPy's global (legacy) RNG once, up front, so every later draw from
# np.random in this notebook is repeatable from a fresh kernel.
SEED = 42
np.random.seed(SEED)

In [2]:
# Directory that holds the competition's pickled datasets (Kaggle input path).
DATA_DIR = "/kaggle/input/fii-nn-2025-homework-3"
train_file = os.path.join(DATA_DIR, "extended_mnist_train.pkl")
test_file = os.path.join(DATA_DIR, "extended_mnist_test.pkl")


def load_pickle(path):
    """Return the Python object stored in the pickle file at ``path``.

    SECURITY: unpickling can execute arbitrary code embedded in the file.
    Only use this on files from a trusted source (here: the competition's
    own dataset files).

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError: if the file content is not a valid pickle.
    """
    with open(path, "rb") as fp:
        return pickle.load(fp)


# Both objects are iterated later in the notebook as (image, label) pairs.
train = load_pickle(train_file)
test = load_pickle(test_file)

In [3]:
def _flatten_and_scale(samples):
    """Return a float32 matrix with one flattened image per row, divided by 255.

    ``samples`` is iterated as (image, label) pairs; only the image is used.
    Presumably the pixels are 8-bit (0-255), so the result lies in [0, 1] --
    TODO confirm against the dataset.
    """
    rows = [image.flatten() for image, _ in samples]
    return np.array(rows, dtype=np.float32) / 255.0


# Feature matrix and integer class labels for the full labelled set.
X_train_full = _flatten_and_scale(train)
y_train_full = np.array([target for _, target in train], dtype=int)

# Hold-out split without shuffling: the first 80% of the samples are used for
# training, the trailing 20% for validation.
split = int(0.8 * len(X_train_full))
train_data, train_labels = X_train_full[:split], y_train_full[:split]
val_data, val_labels = X_train_full[split:], y_train_full[split:]

In [4]:
# Same preprocessing as the training images: one flattened float32 row per
# sample, divided by 255. The second element of each test pair is ignored.
test_rows = [image.flatten() for image, _ in test]
test_data = np.array(test_rows, dtype=np.float32) / 255.0
del test_rows  # the temporary list of per-image copies is no longer needed

In [5]:
class MLP:
    """One-hidden-layer perceptron: ReLU hidden units followed by a softmax output.

    ``forward`` caches its intermediate activations on the instance
    (``Z1``, ``A1``, ``Z2``, ``A2``); ``backward`` reads that cache, so it must
    be called right after ``forward`` on the same batch.
    """

    def __init__(self, input_size=784, hidden_size=100, output_size=10, reg_lambda=0.0005):
        """Create randomly initialised weights and zero biases.

        Args:
            input_size: number of features per input row.
            hidden_size: number of hidden units.
            output_size: number of output classes.
            reg_lambda: L2 penalty coefficient applied to both weight matrices.
        """
        first_layer_std = np.sqrt(2. / input_size)    # He-style scale for the ReLU layer
        second_layer_std = np.sqrt(1. / hidden_size)

        # Draw order (W1, then W2) determines the values obtained under a fixed seed.
        self.W1 = np.random.randn(input_size, hidden_size) * first_layer_std
        self.W2 = np.random.randn(hidden_size, output_size) * second_layer_std

        # Biases are row vectors so they broadcast over the batch dimension.
        self.b1 = np.zeros((1, hidden_size))
        self.b2 = np.zeros((1, output_size))

        self.reg_lambda = reg_lambda

    def relu(self, Z):
        """Element-wise max(0, z)."""
        return np.maximum(0, Z)

    def softmax(self, Z):
        """Row-wise softmax of a 2-D array of logits.

        The row maximum is subtracted before exponentiating to avoid overflow.
        """
        shifted = Z - Z.max(axis=1, keepdims=True)
        exponentials = np.exp(shifted)
        return exponentials / exponentials.sum(axis=1, keepdims=True)

    def forward(self, X):
        """Compute class probabilities for a batch and cache the activations.

        Args:
            X: array of shape (batch, input_size).

        Returns:
            Array of shape (batch, output_size); each row sums to 1.
        """
        hidden_pre = X @ self.W1 + self.b1
        hidden_act = self.relu(hidden_pre)
        logits = hidden_act @ self.W2 + self.b2
        probs = self.softmax(logits)

        # Stored for the subsequent backward() call.
        self.Z1, self.A1 = hidden_pre, hidden_act
        self.Z2, self.A2 = logits, probs
        return probs

    def backward(self, X, y_onehot, lr):
        """Take one gradient-descent step using the activations cached by ``forward``.

        Args:
            X: the batch that was just passed to ``forward``.
            y_onehot: one-hot targets, same shape as the cached output ``A2``.
            lr: learning rate.

        The parameters are updated in place; nothing is returned.
        """
        m = X.shape[0]
        batch_avg = 1 / m
        decay = self.reg_lambda / m

        # Output layer: the error signal is (predicted probabilities - targets).
        grad_logits = self.A2 - y_onehot
        grad_W2 = batch_avg * (self.A1.T @ grad_logits) + decay * self.W2
        grad_b2 = batch_avg * np.sum(grad_logits, axis=0, keepdims=True)

        # Hidden layer: propagate through W2, then gate by the ReLU derivative.
        grad_hidden = grad_logits @ self.W2.T
        grad_pre = grad_hidden * (self.Z1 > 0).astype(float)
        grad_W1 = batch_avg * (X.T @ grad_pre) + decay * self.W1
        grad_b1 = batch_avg * np.sum(grad_pre, axis=0, keepdims=True)

        # All gradients are computed above before any parameter is modified.
        updates = (
            (self.W1, grad_W1),
            (self.b1, grad_b1),
            (self.W2, grad_W2),
            (self.b2, grad_b2),
        )
        for param, grad in updates:
            param -= lr * grad  # in-place update of the underlying array

# Optimisation settings for mini-batch gradient descent.
EPOCHS = 100
BATCH_SIZE = 64
LR = 0.1

# Weight initialisation draws from the global NumPy RNG.
model = MLP()


def _mean_cross_entropy(probs, onehot):
    """Average cross-entropy of ``probs`` against one-hot targets.

    The 1e-8 offset keeps log() finite when a probability is exactly 0.
    """
    return -np.mean(np.sum(onehot * np.log(probs + 1e-8), axis=1))


print("Training started...")

num_train = train_data.shape[0]
class_onehots = np.eye(10)  # row k is the one-hot vector for class k

for epoch in range(EPOCHS):
    # Visit the training samples in a fresh random order every epoch.
    order = np.random.permutation(num_train)
    X_shuffled, y_shuffled = train_data[order], train_labels[order]

    train_loss = 0
    train_correct = 0
    num_batches = 0

    for start in range(0, num_train, BATCH_SIZE):
        window = slice(start, start + BATCH_SIZE)  # the last batch may be shorter
        X_batch, y_batch_labels = X_shuffled[window], y_shuffled[window]
        y_batch_onehot = class_onehots[y_batch_labels]

        # Metrics come from the pre-update forward pass; backward() then
        # reuses the activations that forward() cached on the model.
        A2 = model.forward(X_batch)
        train_loss += _mean_cross_entropy(A2, y_batch_onehot)
        train_correct += np.sum(A2.argmax(axis=1) == y_batch_labels)

        model.backward(X_batch, y_batch_onehot, LR)
        num_batches += 1

    # Loss is the mean of the per-batch mean losses; accuracy is per sample.
    train_loss /= num_batches
    train_accuracy = train_correct / num_train

    # Validation metrics are only computed and reported every 10th epoch.
    if (epoch + 1) % 10 != 0:
        continue

    A2_val = model.forward(val_data)
    val_loss = _mean_cross_entropy(A2_val, class_onehots[val_labels])
    val_preds = A2_val.argmax(axis=1)
    val_accuracy = np.mean(val_preds == val_labels)

    print(f"Epoch {epoch+1}/{EPOCHS} | "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f} | "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

Training started...
Epoch 10/100 | Train Loss: 0.0700, Train Acc: 0.9807 | Val Loss: 0.0966, Val Acc: 0.9718
Epoch 20/100 | Train Loss: 0.0316, Train Acc: 0.9927 | Val Loss: 0.0839, Val Acc: 0.9748
Epoch 30/100 | Train Loss: 0.0165, Train Acc: 0.9973 | Val Loss: 0.0835, Val Acc: 0.9759
Epoch 40/100 | Train Loss: 0.0094, Train Acc: 0.9993 | Val Loss: 0.0820, Val Acc: 0.9770
Epoch 50/100 | Train Loss: 0.0062, Train Acc: 0.9998 | Val Loss: 0.0841, Val Acc: 0.9774
Epoch 60/100 | Train Loss: 0.0045, Train Acc: 0.9999 | Val Loss: 0.0844, Val Acc: 0.9773
Epoch 70/100 | Train Loss: 0.0034, Train Acc: 1.0000 | Val Loss: 0.0858, Val Acc: 0.9776
Epoch 80/100 | Train Loss: 0.0028, Train Acc: 1.0000 | Val Loss: 0.0882, Val Acc: 0.9781
Epoch 90/100 | Train Loss: 0.0024, Train Acc: 1.0000 | Val Loss: 0.0888, Val Acc: 0.9778
Epoch 100/100 | Train Loss: 0.0021, Train Acc: 1.0000 | Val Loss: 0.0900, Val Acc: 0.9781


In [6]:
# Predicted class for each test sample = index of the largest output probability.
test_probs = model.forward(test_data)
predictions = test_probs.argmax(axis=1)

In [7]:
# Prepare the competition submission: one row per test sample, where "ID" is
# the sample's position in `predictions` and "target" is its predicted class.
# The columns are built as whole arrays instead of appending element by element.
predictions_csv = {
    "ID": np.arange(len(predictions)),
    "target": np.asarray(predictions),
}

df = pd.DataFrame(predictions_csv)
df.to_csv("submission.csv", index=False)