#### **12. Implement batch gradient descent with early stopping for softmax regression without using Scikit-Learn, only NumPy. Use it on a classification task such as the iris dataset.**

In [131]:
import numpy as np

In [132]:
from sklearn.datasets import load_iris
# as_frame=True makes the loader return pandas objects, which is why the
# feature matrix is extracted with `.values` further down.
iris = load_iris(as_frame=True)

In [133]:
# Feature matrix as a plain NumPy array (the pandas labels are dropped)
X = iris.data.values
X.shape # 150 instances with 4 features each

(150, 4)

In [134]:
# Index the array of class names with the integer targets, so that y holds
# the species name of every instance rather than its numeric code.
y = iris.target_names[iris.target]
print(f"y has {y.shape[0]} instances") 

y has 150 instances


In [135]:
# Splitting the data into a training set and a test set.
# random_state is fixed so the same split is produced on every run; no
# test_size is passed, so scikit-learn's default proportion is used.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [136]:
# Scratch check: standardize the second feature (column 1) by hand, i.e.
# subtract its mean and divide by its standard deviation.
(X_train[:, 1] - X_train[:, 1].mean()) / X_train[:, 1].std()

array([ 1.2864604 ,  2.43545215, -0.78172474,  0.8268637 ,  0.13746866,
        1.97585545,  1.51625875, -1.47111979, -0.55192639,  0.13746866,
       -0.09232969,  1.2864604 , -0.32212804, -0.09232969, -0.09232969,
        1.05666205, -0.55192639, -0.78172474,  0.8268637 ,  1.05666205,
        1.7460571 , -2.39031318, -0.78172474,  0.8268637 , -0.09232969,
        0.59706535, -0.78172474,  0.8268637 , -0.09232969,  1.7460571 ,
        0.367267  , -0.32212804, -1.24132144,  2.66525049, -0.09232969,
       -0.78172474, -1.93071649, -0.09232969,  0.8268637 , -1.70091814,
        1.97585545, -1.70091814, -0.78172474,  0.59706535,  0.367267  ,
       -1.47111979, -0.09232969,  0.13746866, -0.55192639,  0.8268637 ,
        1.05666205,  0.367267  , -1.24132144,  0.367267  , -0.78172474,
       -1.01152309, -0.09232969, -1.24132144, -0.32212804, -0.09232969,
       -1.93071649,  0.367267  ,  0.13746866,  1.05666205, -0.09232969,
       -1.24132144,  0.8268637 ,  0.8268637 , -0.09232969,  0.59

In [137]:
# Processing the training data
def standard_scale(np_array, col_idxs):
    """Standardize selected columns of a 2-D array to zero mean and unit variance.

    Each selected column is transformed as ``(column - mean) / std`` using
    that column's own mean and standard deviation (NumPy's default ``std``).

    Parameters
    ----------
    np_array : numpy.ndarray
        2-D array indexed as ``np_array[:, col_idx]``.
    col_idxs : iterable of int
        Indices of the columns to standardize. The output columns follow
        this order.

    Returns
    -------
    numpy.ndarray
        Array with one standardized column per entry of ``col_idxs``.
        Columns that are not listed are not included in the result.

    Notes
    -----
    A column with no spread (all values identical) has a standard deviation
    of zero, so the plain formula would produce NaN/inf. Such columns are
    returned as all zeros instead, because every value equals the mean.
    """
    standardized_columns = []
    for col_idx in col_idxs:
        column = np_array[:, col_idx]
        spread = column.std()
        # Rounding can leave a constant column with a std of a few ulps
        # instead of exactly 0, so compare against a tolerance relative to
        # the column's magnitude rather than against 0 itself.
        magnitude = np.abs(column).max() if column.size else 0.0
        if spread <= 8 * np.finfo(float).eps * magnitude:
            standardized_columns.append(np.zeros(column.shape))
        else:
            standardized_columns.append((column - column.mean()) / spread)
    return np.column_stack(standardized_columns)


In [138]:
# Capture the scaling statistics from the raw training features *before*
# X_train is overwritten, so the test set can be scaled with the same values.
feature_idxs = [0, 1, 2, 3]
train_mean = X_train[:, feature_idxs].mean(axis=0)
train_std = X_train[:, feature_idxs].std(axis=0)

X_train = standard_scale(X_train, feature_idxs)

# The test set must live in the same feature space as the training set, so it
# is scaled with the *training* mean/std (never with its own statistics).
# Re-running this cell is harmless: X_train is then already standardized, so
# the statistics are ~0 and ~1 and X_test is left essentially unchanged.
X_test = (X_test[:, feature_idxs] - train_mean) / train_std

X_train.shape

(112, 4)

In [139]:
# Checking the unique values (categories) in both splits
print(f'Unique values in train set: {np.unique(y_train)}')
print(f'Unique values in test set: {np.unique(y_test)}')

# Replacing each class name with a numeric code so that we can calculate the gradients:
# 'setosa' -> 0, 'versicolor' -> 1, anything else (i.e. 'virginica') -> 2.
# The comparison is vectorized over the whole label array instead of looping.
y_train_num = np.where(
    y_train == 'setosa',
    0,
    np.where(y_train == 'versicolor', 1, 2),
)

# Just testing a few
for i in range(0, 5):
    print(f'Index: {i}')
    print(f'Actual category: {y_train[i]}')
    print(f'Numeric category: {y_train_num[i]}')
    print('*'*50)

# Reshape to a column vector of shape (n_instances, 1)
y_train_num = y_train_num.reshape(-1, 1)

Unique values in train set: ['setosa' 'versicolor' 'virginica']
Unique values in train set: ['setosa' 'versicolor' 'virginica']
Index: 0
Actual category: setosa
Numeric category: 0
**************************************************
Index: 1
Actual category: setosa
Numeric category: 0
**************************************************
Index: 2
Actual category: virginica
Numeric category: 2
**************************************************
Index: 3
Actual category: versicolor
Numeric category: 1
**************************************************
Index: 4
Actual category: versicolor
Numeric category: 1
**************************************************


In [140]:
# Encode the test labels with the same name -> code mapping used for the
# training labels: 'setosa' -> 0, 'versicolor' -> 1, and every other name
# falls through to 2. The result is a 1-D integer array.
y_test_num = np.select(
    [y_test == 'setosa', y_test == 'versicolor'],
    [0, 1],
    default=2,
)

In [141]:
# Sanity-check the shapes of the feature and label arrays for both splits.
# Note that the training labels were reshaped to a column vector, whereas the
# test labels were left one-dimensional.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train_num.shape}")
print('*'*50)
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test_num.shape}")

X_train shape: (112, 4)
y_train shape: (112, 1)
**************************************************
X_test shape: (38, 4)
y_test shape: (38,)


In [151]:
# Each class should have a weight for each feature
# Since we have 3 classes, and 4 features
# We expect a 3 x 4 matrix (each class' weights for each feature is one row)

# So let us randomly initialize the weights
# NOTE(review): no seed is set in this cell, so unless the generator was seeded
# earlier in the session these weights differ from run to run.
num_classes = len(np.unique(y_train_num))
num_features = X_train.shape[1]

print(f"Since we have {num_classes} classes, and {num_features} features, \
our weight matrix will have shape {num_classes} x {num_features}.")

theta = np.random.randn(num_classes, num_features)
print(theta.shape)

# Raw linear score (logit) of the first instance for class 0 with random weights.
# This is not a probability: it is an unbounded dot product. To obtain a class
# probability, the scores of all classes would have to go through the softmax.
# (`.T` has no effect on the 1-D row theta[0, :].)
theta[0, :].T @ X_train[0]

Since we have 3 classes, and 4 features, our weight matrix will have shape 3 x 4.
(3, 4)


2.1062633607327426

In [142]:
# Implementing batch gradient descent
# NOTE(review): the update below is the gradient of the mean squared error of a
# linear model, (2/m) * X^T (X @ theta - y), with the integer class codes in
# y_train_num used as regression targets. It does not compute softmax
# probabilities or a cross-entropy gradient, there is no bias term, and the loop
# always runs for the full num_epochs (no validation-based early stopping).
learning_rate = 0.02
num_epochs = 1000
num_instances = X_train.shape[0]

np.random.seed(42)
# Randomly initialize weights for each feature
# (a single column vector with one weight per feature; this rebinds `theta`)
theta = np.random.randn(X_train.shape[1], 1)
print(f"Initial Weights: {theta}\n")

# Full-batch updates: every epoch uses all training instances to compute the gradient
for epoch in range(num_epochs):
    gradients = 2 / num_instances * X_train.T @ (X_train @ theta - y_train_num)
    theta = theta - learning_rate * gradients

# Final theta
print(f"Final theta: {theta}")

Initial Weights: [[ 0.49671415]
 [-0.1382643 ]
 [ 0.64768854]
 [ 1.52302986]]

Final theta: [[-0.02153836]
 [-0.05733346]
 [ 0.24250165]
 [ 0.53404859]]
