Implement batch gradient descent with early stopping for softmax regression without using Scikit-Learn, only NumPy. Use it on a classification task such as the iris dataset

In [18]:
import numpy as np
from copy import deepcopy

epsilon = 1e-5

def to_one_hot(y):
    return np.diag(np.ones(y.max() + 1))[y]

class SoftMaxRegression:
    def softmax(self, logits):
        exps = np.exp(logits)
        exp_sums = exps.sum(axis=1, keepdims=True)
        return exps / exp_sums

    def fit(self, X_train, y_train, X_valid, y_valid, C=100, eta=0.5, n_epochs=5001):
        m = len(X_train)
        best_loss = np.infty

        y_train_one_hot = to_one_hot(y_train)
        y_valid_one_hot = to_one_hot(y_valid)

        n_inputs, n_outputs = X_train.shape[1], len(np.unique(y_train))

        np.random.seed(42)
        self.Theta = np.random.randn(n_inputs, n_outputs)

        for epoch in range(n_epochs):
            logits = X_train @ self.Theta
            y_proba = self.softmax(logits) 
            y_proba_valid = self.softmax(X_valid @ self.Theta)
            xentropy_losses = self.xentropy_losses(y_valid_one_hot, y_proba_valid)
            l2_loss = 1 / 2 *(self.Theta[1:] ** 2).sum()
            total_loss = xentropy_losses.sum(axis=1).mean() + 1 / C * l2_loss

            if epoch % 1000 == 0:
                print(epoch, total_loss.round(4))
            if total_loss < best_loss:
                self.model_params = self.Theta
                best_loss = total_loss
            # else:
            #     print(epoch - 1, best_loss.round(4))
            #     print(epoch, total_loss.round(4), "early stopping!")
            #     break

            error = y_proba - y_train_one_hot
            gradients = 1 / m * X_train.T @ error
            gradients += np.r_[np.zeros([1, n_outputs]), 1 / C * self.Theta[1:]]
            self.Theta = self.Theta - eta * gradients

    def xentropy_losses(self, y_valid_one_hot, y_proba_valid):
        return -(y_valid_one_hot * np.log(y_proba_valid + epsilon))

    def accuracy(self, X_valid, y_valid):
        # logits = X_valid @ self.Theta
        logits = X_valid @ self.model_params
        y_proba = self.softmax(logits)
        y_predict = y_proba.argmax(axis=1)

        accuracy_score = (y_predict == y_valid).mean()
        return accuracy_score

In [2]:
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)

X = iris.data[["petal length (cm)", "petal width (cm)"]].values
y = iris["target"].values

In [3]:
X_with_bias = np.c_[np.ones(len(X)), X]

In [4]:
test_ratio = 0.2
validation_ratio = 0.2
total_size = len(X_with_bias)

test_size = int(total_size * test_ratio)
validation_size = int(total_size * validation_ratio)
train_size = total_size - test_size - validation_size

np.random.seed(42)
rnd_indices = np.random.permutation(total_size)

X_train = X_with_bias[rnd_indices[:train_size]]
y_train = y[rnd_indices[:train_size]]
X_valid = X_with_bias[rnd_indices[train_size:-test_size]]
y_valid = y[rnd_indices[train_size:-test_size]]
X_test = X_with_bias[rnd_indices[test_size:]]
y_test = y[rnd_indices[test_size:]]

In [19]:
softmax_reg = SoftMaxRegression()
softmax_reg.fit(X_train, y_train, X_valid, y_valid)

0 7.6181
1000 0.3435
2000 0.2753
3000 0.2746
4000 0.2743
5000 0.2741


In [20]:
accuracy_score = softmax_reg.accuracy(X_test, y_test)
accuracy_score

0.95