In [1]:
import pickle
import os
import pandas as pd
import numpy as np


In [2]:
train_file = "/kaggle/input/fii-nn-2025-homework-3/extended_mnist_train.pkl"
test_file = "/kaggle/input/fii-nn-2025-homework-3/extended_mnist_test.pkl"


def _load_pickle(path):
    """Read and return the single object pickled in the file at `path`."""
    # NOTE: unpickling can execute arbitrary code, so this must only ever be
    # pointed at trusted files (here: the competition's own datasets).
    with open(path, "rb") as handle:
        return pickle.load(handle)


train = _load_pickle(train_file)
test = _load_pickle(test_file)

In [3]:
# `train` is iterated as (image, label) pairs. Each image is flattened to a
# 1-D vector so the stacked result is a 2-D float32 matrix with one row per
# sample; the labels are collected in the same order.
train_data = np.array(
    [image.flatten() for image, _ in train],
    dtype=np.float32,
)
train_labels = np.array([label for _, label in train])


In [4]:
# `test` is iterated as (image, label) pairs as well, but the second element
# is ignored here: only the flattened images are kept, one float32 row each.
test_data = np.array(
    [image.flatten() for image, _ in test],
    dtype=np.float32,
)

In [5]:
class MLP:
    """Single-hidden-layer perceptron (ReLU hidden layer, softmax output).

    The network is trained with plain mini-batch gradient descent on the
    cross-entropy loss. Optional inverted dropout can be applied to the
    hidden activations during training.

    Randomness (weight initialisation, dropout masks, epoch shuffling) comes
    from NumPy's global ``np.random`` state, so seed it before constructing
    and fitting the model if reproducible runs are needed.

    Parameters
    ----------
    input_size : int
        Number of features per input row.
    hidden_size : int
        Number of hidden units.
    output_size : int
        Number of output classes.
    learning_rate : float
        Step size used for every parameter update in ``backward``.
    dropout_rate : float
        Probability of zeroing a hidden unit during training. Must satisfy
        ``0 <= dropout_rate < 1``; ``0`` disables dropout.

    Raises
    ------
    ValueError
        If ``dropout_rate`` is outside ``[0, 1)``.
    """

    def __init__(self, input_size=784, hidden_size=100, output_size=10,
                 learning_rate=0.01, dropout_rate=0.0):
        # A rate of 1 would divide by zero in the inverted-dropout rescaling,
        # and a negative rate has no meaning, so reject both up front.
        if not 0.0 <= dropout_rate < 1.0:
            raise ValueError(
                f"dropout_rate must be in [0, 1), got {dropout_rate!r}"
            )

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.dropout_rate = dropout_rate

        # Weights are drawn from N(0, 1) and scaled by sqrt(2 / fan_in);
        # biases start at zero.
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))

        # Values cached by the most recent forward() call and consumed by
        # backward().
        self.z1 = None
        self.a1 = None
        self.z2 = None
        self.a2 = None
        self.dropout_mask = None

    def relu(self, z):
        """Return the element-wise rectified linear unit ``max(0, z)``."""
        return np.maximum(0, z)

    def relu_derivative(self, z):
        """Return 1.0 where ``z > 0`` and 0.0 elsewhere (same shape as ``z``)."""
        return (z > 0).astype(float)

    def softmax(self, z):
        """Return the row-wise softmax of the 2-D array ``z``.

        The row maximum is subtracted before exponentiating so that large
        logits do not overflow.
        """
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def forward(self, X, training=True):
        """Run a forward pass and cache the intermediates for ``backward``.

        Parameters
        ----------
        X : ndarray of shape (batch, input_size)
        training : bool
            When true and ``dropout_rate > 0``, a fresh dropout mask is
            sampled and applied to the hidden activations.

        Returns
        -------
        ndarray of shape (batch, output_size)
            Softmax class probabilities.
        """
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.relu(self.z1)

        if training and self.dropout_rate > 0:
            # Inverted dropout: surviving units are scaled by 1 / (1 - rate)
            # so no rescaling is needed at inference time.
            self.dropout_mask = (np.random.rand(*self.a1.shape) > self.dropout_rate).astype(float)
            self.a1 *= self.dropout_mask
            self.a1 /= (1 - self.dropout_rate)
        else:
            # Drop any mask left over from an earlier training batch so a
            # subsequent backward() cannot apply a stale (and possibly
            # differently shaped) mask to this pass's gradients.
            self.dropout_mask = None

        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.softmax(self.z2)

        return self.a2

    def backward(self, X, y, output):
        """Backpropagate the cross-entropy loss and update all parameters.

        Must be called after ``forward`` on the same ``X``, because it reads
        the activations (and dropout mask, if any) cached by that call.

        Parameters
        ----------
        X : ndarray of shape (batch, input_size)
            The inputs that were passed to ``forward``.
        y : ndarray of shape (batch, output_size)
            One-hot encoded targets.
        output : ndarray of shape (batch, output_size)
            The probabilities returned by ``forward``.
        """
        batch_size = X.shape[0]

        # Gradient of softmax + cross-entropy with respect to the logits.
        dz2 = output - y
        dW2 = np.dot(self.a1.T, dz2) / batch_size
        db2 = np.sum(dz2, axis=0, keepdims=True) / batch_size

        da1 = np.dot(dz2, self.W2.T)

        # Mirror the forward-pass dropout: the same units are zeroed and the
        # same 1 / (1 - rate) scaling is applied to the gradient.
        if self.dropout_rate > 0 and self.dropout_mask is not None:
            da1 *= self.dropout_mask
            da1 /= (1 - self.dropout_rate)

        dz1 = da1 * self.relu_derivative(self.z1)
        dW1 = np.dot(X.T, dz1) / batch_size
        db1 = np.sum(dz1, axis=0, keepdims=True) / batch_size

        self.W1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1
        self.W2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2

    def compute_loss(self, y_true, y_pred):
        """Return the mean cross-entropy between one-hot ``y_true`` and ``y_pred``.

        A small constant (1e-7) is added inside the logarithm so a predicted
        probability of exactly zero does not produce ``-inf``.
        """
        batch_size = y_true.shape[0]
        log_likelihood = -np.log(y_pred[np.arange(batch_size), np.argmax(y_true, axis=1)] + 1e-7)
        loss = np.mean(log_likelihood)
        return loss

    def compute_accuracy(self, y_true, y_pred):
        """Return the fraction of rows whose arg-max prediction matches ``y_true``."""
        predictions = np.argmax(y_pred, axis=1)
        true_labels = np.argmax(y_true, axis=1)
        return np.mean(predictions == true_labels)

    def fit(self, X_train, y_train, X_val=None, y_val=None,
            epochs=50, batch_size=128, verbose=True):
        """Train the network with shuffled mini-batches.

        Parameters
        ----------
        X_train : ndarray of shape (n_samples, input_size)
        y_train : ndarray of shape (n_samples, output_size)
            One-hot encoded training targets.
        X_val, y_val : ndarray, optional
            Validation inputs / one-hot targets. Validation metrics are
            recorded only when both are given.
        epochs : int
            Number of passes over the training data.
        batch_size : int
            Mini-batch size; the last batch of an epoch may be smaller.
        verbose : bool
            When true, print a progress line every 10th epoch.

        Returns
        -------
        dict
            Lists of per-epoch metrics under the keys ``'train_loss'``,
            ``'train_acc'``, ``'val_loss'`` and ``'val_acc'`` (the last two
            stay empty without validation data).
        """
        n_samples = X_train.shape[0]
        has_validation = X_val is not None and y_val is not None
        history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

        for epoch in range(epochs):
            # Reshuffle every epoch so the mini-batches differ between epochs.
            indices = np.random.permutation(n_samples)
            X_shuffled = X_train[indices]
            y_shuffled = y_train[indices]

            for i in range(0, n_samples, batch_size):
                X_batch = X_shuffled[i:i+batch_size]
                y_batch = y_shuffled[i:i+batch_size]

                output = self.forward(X_batch, training=True)
                self.backward(X_batch, y_batch, output)

            # Epoch metrics are measured with dropout switched off.
            train_output = self.forward(X_train, training=False)
            train_loss = self.compute_loss(y_train, train_output)
            train_acc = self.compute_accuracy(y_train, train_output)

            history['train_loss'].append(train_loss)
            history['train_acc'].append(train_acc)

            message = (f"Epoch {epoch+1}/{epochs} - "
                       f"Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")

            if has_validation:
                val_output = self.forward(X_val, training=False)
                val_loss = self.compute_loss(y_val, val_output)
                val_acc = self.compute_accuracy(y_val, val_output)
                history['val_loss'].append(val_loss)
                history['val_acc'].append(val_acc)
                message += f", Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}"

            if verbose and (epoch + 1) % 10 == 0:
                print(message)

        return history

    def predict(self, X):
        """Return the predicted class index for each row of ``X`` (dropout off)."""
        output = self.forward(X, training=False)
        return np.argmax(output, axis=1)

In [6]:
# Scale the raw pixel values by 1/255.
# NOTE: this cell rebinds `train_data` / `test_data` from their own previous
# values, so running it a second time without re-running the cells above
# would rescale data that has already been normalised.
train_data, test_data = train_data / 255.0, test_data / 255.0

# Standardise both sets with one scalar mean/std taken from the training
# pixels only; the small epsilon guards against a zero standard deviation.
mean = train_data.mean()
std = train_data.std() + 1e-7
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

# One-hot encode the labels by picking rows of the 10x10 identity matrix.
train_labels_onehot = np.eye(10)[train_labels]

# Seeded shuffle, then hold out the last 10% of the shuffled indices for
# validation.
np.random.seed(42)
indices = np.random.permutation(len(train_data))
split_idx = int(0.9 * len(train_data))
train_idx, val_idx = np.split(indices, [split_idx])

X_train, y_train = train_data[train_idx], train_labels_onehot[train_idx]
X_val, y_val = train_data[val_idx], train_labels_onehot[val_idx]

print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")
print(f"Test samples: {test_data.shape[0]}")

Training samples: 54000
Validation samples: 6000
Test samples: 10000


In [7]:
# Seed NumPy's global RNG, which the MLP uses for weight initialisation,
# dropout masks and per-epoch shuffling.
np.random.seed(42)

mlp = MLP(
    input_size=784,
    hidden_size=100,
    output_size=10,
    learning_rate=0.1,
    dropout_rate=0.3
)

print("\nTraining MLP with Backpropagation + DROPOUT regularization")

history = mlp.fit(
    X_train, y_train,
    X_val, y_val,
    epochs=150,
    batch_size=128,
    verbose=True
)

print(f"\nFinal Training Accuracy: {history['train_acc'][-1]:.4f}")
print(f"\nFinal Validation Accuracy: {history['val_acc'][-1]:.4f}")
# Report the rate the model was actually built with; the previous hard-coded
# "rate=0.4" disagreed with dropout_rate=0.3 above.
print(f"\nRegularization technique: DROPOUT (rate={mlp.dropout_rate})")



Training MLP with Backpropagation + DROPOUT regularization
Epoch 10/150 - Loss: 0.0532, Acc: 0.9841, Val Loss: 0.0904, Val Acc: 0.9717
Epoch 20/150 - Loss: 0.0303, Acc: 0.9913, Val Loss: 0.0824, Val Acc: 0.9762
Epoch 30/150 - Loss: 0.0193, Acc: 0.9950, Val Loss: 0.0846, Val Acc: 0.9773
Epoch 40/150 - Loss: 0.0129, Acc: 0.9970, Val Loss: 0.0854, Val Acc: 0.9773
Epoch 50/150 - Loss: 0.0100, Acc: 0.9977, Val Loss: 0.0884, Val Acc: 0.9778
Epoch 60/150 - Loss: 0.0075, Acc: 0.9988, Val Loss: 0.0895, Val Acc: 0.9780
Epoch 70/150 - Loss: 0.0063, Acc: 0.9990, Val Loss: 0.0897, Val Acc: 0.9778
Epoch 80/150 - Loss: 0.0057, Acc: 0.9990, Val Loss: 0.0935, Val Acc: 0.9777
Epoch 90/150 - Loss: 0.0044, Acc: 0.9994, Val Loss: 0.0934, Val Acc: 0.9785
Epoch 100/150 - Loss: 0.0034, Acc: 0.9994, Val Loss: 0.1008, Val Acc: 0.9777
Epoch 110/150 - Loss: 0.0036, Acc: 0.9995, Val Loss: 0.1058, Val Acc: 0.9768
Epoch 120/150 - Loss: 0.0027, Acc: 0.9998, Val Loss: 0.1014, Val Acc: 0.9780
Epoch 130/150 - Loss: 0.0

In [8]:
# Predicted class index for every row of `test_data`; MLP.predict runs the
# forward pass with training=False, so no dropout is applied here.
predictions = mlp.predict(test_data)

In [9]:
# Competition submission file: one row per prediction, where "ID" is the
# position of the prediction in `predictions` and "target" is the predicted
# class.
predictions_csv = {
    "ID": list(range(len(predictions))),
    "target": list(predictions),
}

df = pd.DataFrame(predictions_csv)
df.to_csv("submission.csv", index=False)