In [1]:
# Load scikit-learn's bundled Iris dataset and list the fields the returned
# container exposes (data, target, feature names, ...).
from sklearn import datasets
iris=datasets.load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [2]:
# Keep only feature columns 2 and 3 of the data matrix as model inputs
# (presumably petal length and petal width -- confirm against iris["feature_names"]).
X = iris["data"][:, [2, 3]]
# Integer class label for every sample.
y = iris["target"]

In [4]:
import numpy as np

# Prepend a constant column of ones so the first row of the weight matrix
# acts as the bias (intercept) term.
bias_column = np.ones((len(X), 1))
X_with_bias = np.hstack([bias_column, X])
X_with_bias[:5]

array([[1. , 1.4, 0.2],
       [1. , 1.4, 0.2],
       [1. , 1.3, 0.2],
       [1. , 1.5, 0.2],
       [1. , 1.4, 0.2]])

In [5]:
# Fix the seed of NumPy's global random generator; the later
# np.random.permutation (data shuffle) and np.random.randn (weight init)
# calls all draw from it.
np.random.seed(2042)

In [20]:
# Manual train / validation / test split: 20% test, 20% validation,
# and whatever remains goes to training.
test_ratio = 0.2
val_ratio = 0.2
total_size = len(X_with_bias)

test_size = int(total_size * test_ratio)
val_size = int(total_size * val_ratio)
train_size = total_size - test_size - val_size

# Shuffle the row indices once, then carve the permutation into three
# consecutive, disjoint ranges: [train | validation | test].
rnd_indices = np.random.permutation(total_size)

# Slice with explicit end offsets instead of negative indices. With negative
# slices, a test_size that rounds down to 0 would make `[-test_size:]` select
# *every* row and `[train_size:-test_size]` select none.
val_end = train_size + val_size
train_indices = rnd_indices[:train_size]
val_indices = rnd_indices[train_size:val_end]
test_indices = rnd_indices[val_end:]

X_train = X_with_bias[train_indices]
y_train = y[train_indices]
X_val = X_with_bias[val_indices]
y_val = y[val_indices]
X_test = X_with_bias[test_indices]
y_test = y[test_indices]

In [21]:
def to_one_hot(y, n_classes=None):
    """Convert a 1-D array of integer class labels into a one-hot matrix.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class indices, each in ``range(n_classes)``.
    n_classes : int, optional
        Number of columns of the result. When omitted it is inferred as
        ``y.max() + 1``, which is only correct if the highest class actually
        occurs in ``y``. Pass it explicitly when encoding a subset (for
        example a validation split) so every split gets the same width.

    Returns
    -------
    numpy.ndarray of float, shape (m, n_classes)
        Row ``i`` is all zeros except for a 1 in column ``y[i]``.
    """
    y = np.asarray(y)
    m = len(y)
    if m == 0:
        # Nothing to encode; max() of an empty array would raise.
        return np.zeros((0, n_classes or 0))
    if n_classes is None:
        n_classes = int(y.max()) + 1
    y_one_hot = np.zeros((m, n_classes))
    # Fancy indexing: for each row i, set column y[i] to 1.
    y_one_hot[np.arange(m), y] = 1
    return y_one_hot

In [22]:
# Peek at the first five integer class labels of the training split.
y_train[:5]

array([2, 0, 2, 1, 0])

In [23]:
# Sanity check: one-hot encode those same five labels and compare by eye.
to_one_hot(y_train[:5])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [24]:
# One-hot encode the labels of all three splits in one pass.
# Each call infers its class count from the labels it is given.
y_train_one_hot, y_val_one_hot, y_test_one_hot = (
    to_one_hot(labels) for labels in (y_train, y_val, y_test)
)

In [25]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : array-like, shape (m, k)
        Unnormalised class scores, one row per sample.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Non-negative values where every row sums to 1.
    """
    logits = np.asarray(logits)
    # Softmax is unchanged when a constant is subtracted from a whole row.
    # Shifting by the row maximum makes the largest exponent exp(0) = 1, so
    # np.exp can no longer overflow to inf (which would give inf/inf = NaN).
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [30]:
# Model dimensions: one input per column of X_train (bias column included),
# one output per distinct class label seen in the training split.
_, n_inputs = X_train.shape
n_outputs = np.unique(y_train).size

In [33]:
# Batch gradient descent for softmax regression, without regularisation.
eta = 0.01            # learning rate
n_iterations = 5001
m = len(X_train)      # number of training samples
epsilon = 1e-7        # added inside the log so log(0) never occurs

# Random starting weights: one column of parameters per class.
Theta = np.random.randn(n_inputs, n_outputs)

for i in range(n_iterations):
    logits = np.dot(X_train, Theta)
    Y_proba = softmax(logits)
    if i % 500 == 0:
        # Report the mean cross-entropy on the training set every 500 steps.
        log_likelihoods = np.sum(y_train_one_hot * np.log(Y_proba + epsilon), axis=1)
        loss = -np.mean(log_likelihoods)
        print(i, loss)
    # Gradient of the mean cross-entropy with respect to Theta.
    error = Y_proba - y_train_one_hot
    gradients = 1/m * np.dot(X_train.T, error)
    Theta = Theta - eta * gradients

0 5.8285220987930275
500 0.8628788630980642
1000 0.7172370543817995
1500 0.6289979536153374
2000 0.5697794309834703
2500 0.5268651395466324
3000 0.49399474427793827
3500 0.46777295712811817
4000 0.4462025770226584
4500 0.4280295539675276
5000 0.41242421799794043


In [34]:
# Inspect the learned weights: one row per input (bias first), one column per class.
Theta

array([[ 3.57713219,  0.06182778, -1.89934743],
       [-0.75700209,  0.12093512, -0.20704129],
       [-1.5014538 ,  0.30932909,  2.51880068]])

In [36]:
# Validation accuracy: predict the most probable class for every validation
# sample and measure the fraction that matches the true label.
logits = np.dot(X_val, Theta)
Y_proba = softmax(logits)
y_predict = Y_proba.argmax(axis=1)
accuracy_score = (y_predict == y_val).mean()
accuracy_score

np.float64(0.9333333333333333)

In [40]:
# Batch gradient descent for softmax regression with an L2 penalty on the
# weights; the bias row (row 0 of Theta) is left out of the penalty.
eta = 0.1             # learning rate
n_iterations = 5001
m = len(X_train)      # number of training samples
epsilon = 1e-7        # added inside the log so log(0) never occurs

alpha = 0.1           # regularisation strength
Theta = np.random.randn(n_inputs, n_outputs)
for iteration in range(n_iterations):
    logits = np.dot(X_train, Theta)
    Y_proba = softmax(logits)
    if iteration % 500 == 0:
        # Progress report: cross-entropy plus the weighted L2 term.
        log_likelihoods = np.sum(y_train_one_hot * np.log(Y_proba + epsilon), axis=1)
        xentropy_loss = -np.mean(log_likelihoods)
        l2_loss = 1/2 * np.sum(np.square(Theta[1:]))
        loss = xentropy_loss + alpha * l2_loss
        print(iteration, loss)
    error = Y_proba - y_train_one_hot
    # Data term of the gradient, then the penalty term with a zero row
    # stacked on top so the bias receives no shrinkage.
    data_gradients = 1/m * np.dot(X_train.T, error)
    penalty_gradients = np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    gradients = data_gradients + penalty_gradients
    Theta = Theta - eta * gradients


0 1.4750733530361029
500 0.5618325536773807
1000 0.5320755816364335
1500 0.523097987174734
2000 0.5197147632023196
2500 0.5183284687639929
3000 0.517735720367923
3500 0.5174760000568017
4000 0.5173604860049751
4500 0.5173086202520893
5000 0.5172851889756602


In [41]:
# Inspect the weights learned by the L2-regularised run.
Theta

array([[ 4.33429419,  0.58305694, -4.67944235],
       [-1.10068448,  0.15013305,  0.95055143],
       [-0.43271266, -0.17995345,  0.61266611]])

In [43]:
# Validation accuracy of the current Theta: share of validation samples whose
# highest-probability class equals the true label.
logits = np.dot(X_val, Theta)
Y_proba = softmax(logits)
y_predict = Y_proba.argmax(axis=1)

is_correct = y_predict == y_val
accuracy_score = is_correct.mean()
accuracy_score

np.float64(0.9666666666666667)

In [47]:
# L2-regularised softmax regression trained by batch gradient descent with
# early stopping: after every update the (regularised) loss is measured on
# the validation split, and training halts the first time it fails to improve.
eta = 0.1             # learning rate
n_iterations = 5001
m = len(X_train)      # number of training samples
epsilon = 1e-7        # added inside the log so log(0) never occurs
alpha = 0.1  # regularization hyperparameter

best_loss = np.inf

Theta = np.random.randn(n_inputs, n_outputs)
# Snapshot of the parameters that achieved best_loss. Without it, an early
# stop would leave Theta at the step that made the validation loss *worse*.
best_Theta = Theta.copy()

for iteration in range(n_iterations):
    # One gradient step on the training set (bias row excluded from the penalty).
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    error = Y_proba - y_train_one_hot
    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta * gradients

    # Evaluate the updated parameters on the validation set.
    logits = X_val.dot(Theta)
    Y_proba = softmax(logits)
    xentropy_loss = -np.mean(np.sum(y_val_one_hot * np.log(Y_proba + epsilon), axis=1))
    l2_loss = 1/2 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha * l2_loss

    if iteration % 500 == 0:
        print(iteration, loss)
    if loss < best_loss:
        best_loss = loss
        best_Theta = Theta.copy()
    else:
        print(iteration - 1, best_loss)
        print(iteration, loss, "early stopping!")
        # Roll back to the parameters from the best validation step.
        Theta = best_Theta
        break


0 2.8044976338482397
500 0.5281458771399836
1000 0.4908330944716105
1500 0.4773759819370363
2000 0.4713377895085288
2500 0.4683295073739356
3000 0.46671426301535957
3500 0.4657949128564906
4000 0.46524713981272414
4500 0.4649090417727526
5000 0.4646947548072835


In [48]:
# Inspect the weights left by the early-stopping run.
Theta

array([[ 3.91008629,  0.15879653, -5.10457609],
       [-1.10074682,  0.1500831 ,  0.95066372],
       [-0.43272786, -0.17996039,  0.61268825]])

In [50]:
# Validation accuracy of the current Theta.
logits = np.dot(X_val, Theta)
Y_proba = softmax(logits)
# The predicted class is the column holding the largest probability.
y_predict = Y_proba.argmax(axis=1)

accuracy_score = np.mean(np.equal(y_predict, y_val))
accuracy_score

np.float64(0.9666666666666667)

In [None]:
logits=X_test.dot(Theta)
y_proba=softmax(logits)
y_predict=np.