# Task: Implement Batch Gradient Descent with early stopping for Softmax Regression (without using Scikit-Learn).

In [13]:
from sklearn import datasets
iris = datasets.load_iris()
list(iris.keys())

['data',
 'target',
 'frame',
 'target_names',
 'DESCR',
 'feature_names',
 'filename',
 'data_module']

In [19]:
import numpy as np


X = iris['data'][:, (2, 3)]   # petal length, petal width
X = np.c_[np.ones([len(X), 1]), X]  # add bias term
y = iris['target']

In [21]:
np.random.seed(2042)

test_ratio = 0.2
validation_ratio = 0.2
total_size = len(y)

test_size = int(total_size * test_ratio)
validation_size = int(total_size * validation_ratio)
train_size = total_size - test_size - validation_size

rnd_indices = np.random.permutation(total_size)

X_train = X[rnd_indices[:train_size]]
X_valid = X[rnd_indices[train_size:-test_size]]
X_test = X[rnd_indices[-test_size:]]
y_train = y[rnd_indices[:train_size]]
y_valid = y[rnd_indices[train_size:-test_size]]
y_test = y[rnd_indices[-test_size:]]


In [41]:
def one_hot_encoding(y):
    n_classes = y.max() + 1
    m = len(y)
    Y_one_hot = np.zeros((m, n_classes))
    Y_one_hot[np.arange(m), y] = 1
    return Y_one_hot


def train(X_train, y_train, X_valid, y_valid):
    y_train_encoded = one_hot_encoding(y_train)
    y_valid_encoded = one_hot_encoding(y_valid)
    theta = gradient_descent(X_train, y_train_encoded, X_valid, y_valid_encoded)
    print()
    print(theta, end='\n\n')
    return theta


def gradient_descent(X_train, y_train, X_valid, y_valid, eta=0.1,
                     n_iterations=5000, epsilon=1e-7, alpha=0.1):
    m = X_train.shape[0]  # number of rows
    output_number = len(np.unique(y_train, axis=0))
    input_number = X_train.shape[1]
    theta = np.random.randn(input_number, output_number)
    best_loss = np.infty

    for iteration in range(1, n_iterations+1):
        z = X_train.dot(theta)
        y_train_prob = softmax(z)
        error = y_train_prob - y_train
        gradients = 1/m * X_train.T.dot(error)
        theta -= eta * gradients

        valid_z = X_valid.dot(theta)
        y_valid_prob = softmax(valid_z)
        # logarithm will alway_trains be negative, so we add -
        cross_entropy_train_loss = -np.mean(
            np.sum(y_valid * np.log(y_valid_prob + epsilon), axis=1)
            )
        l2_loss = 1/2 * np.sum(np.square(theta[1:]))
        loss = cross_entropy_train_loss + alpha * l2_loss

        if iteration % 500 == 0:
            print(iteration, loss)
        if loss < best_loss:
            best_loss = loss
        else:
            print(iteration-1, best_loss)
            print(iteration, loss, "early stopping!")
            break

    return theta


def softmax(z):
    y_prob = np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
    return y_prob


theta = train(X_train, y_train, X_valid, y_valid)


220 1.2509798944267723
221 1.2509816824306497 early stopping!

[[ 1.37621026 -0.3677057  -2.15691583]
 [ 0.4160048   0.08952304  0.36678774]
 [-1.32032754  2.27451352  2.68778647]]



In [42]:
logits = X_valid.dot(theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.9

In [43]:
logits = X_test.dot(theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_test)
accuracy_score

0.9