#### **12. Implement batch gradient descent with early stopping for softmax regression without using Scikit-Learn, only NumPy. Use it on a classification task such as the iris dataset.**

In [2]:
import numpy as np

In [3]:
from sklearn.datasets import load_iris
iris = load_iris(as_frame=True)

In [4]:
X = iris.data.values
X.shape # 150 instances with 4 features each

(150, 4)

In [5]:
y = iris.target_names[iris.target]
print(f"y has {y.shape[0]} instances") 

y has 150 instances


In [6]:
# Splitting the data 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
(X_train[:, 1] - X_train[:, 1].mean()) / X_train[:, 1].std()

array([ 1.2864604 ,  2.43545215, -0.78172474,  0.8268637 ,  0.13746866,
        1.97585545,  1.51625875, -1.47111979, -0.55192639,  0.13746866,
       -0.09232969,  1.2864604 , -0.32212804, -0.09232969, -0.09232969,
        1.05666205, -0.55192639, -0.78172474,  0.8268637 ,  1.05666205,
        1.7460571 , -2.39031318, -0.78172474,  0.8268637 , -0.09232969,
        0.59706535, -0.78172474,  0.8268637 , -0.09232969,  1.7460571 ,
        0.367267  , -0.32212804, -1.24132144,  2.66525049, -0.09232969,
       -0.78172474, -1.93071649, -0.09232969,  0.8268637 , -1.70091814,
        1.97585545, -1.70091814, -0.78172474,  0.59706535,  0.367267  ,
       -1.47111979, -0.09232969,  0.13746866, -0.55192639,  0.8268637 ,
        1.05666205,  0.367267  , -1.24132144,  0.367267  , -0.78172474,
       -1.01152309, -0.09232969, -1.24132144, -0.32212804, -0.09232969,
       -1.93071649,  0.367267  ,  0.13746866,  1.05666205, -0.09232969,
       -1.24132144,  0.8268637 ,  0.8268637 , -0.09232969,  0.59

In [8]:
# Processing the training data
def standard_scale(np_array, col_idxs):
    standardized_columns = []
    for col_idx in col_idxs:
        standardized_columns.append(
            (np_array[:, col_idx] - np_array[:, col_idx].mean()) / np_array[:, col_idx].std()
        )
    return np.column_stack(standardized_columns)


In [9]:
X_train = standard_scale(X_train, [0, 1, 2, 3])
X_train.shape

(112, 4)

In [10]:
# Checking the unique values (categories)
print(f'Unique values in train set: {np.unique(y_train)}')
print(f'Unique values in train set: {np.unique(y_test)}')

# Replacing each value with a numeric value so that we can calculate the gradients
y_train_num = []
for i in range(len(y_train)):
    if y_train[i] == 'setosa':
        y_train_num.append(0)
    elif y_train[i] == 'versicolor':
        y_train_num.append(1)
    else:
        y_train_num.append(2)

# Just testing a few
for i in range(0, 5):
    print(f'Index: {i}')
    print(f'Actual category: {y_train[i]}')
    print(f'Numeric category: {y_train_num[i]}')
    print('*'*50)

y_train_num = np.array(y_train_num).reshape(-1, 1)

Unique values in train set: ['setosa' 'versicolor' 'virginica']
Unique values in train set: ['setosa' 'versicolor' 'virginica']
Index: 0
Actual category: setosa
Numeric category: 0
**************************************************
Index: 1
Actual category: setosa
Numeric category: 0
**************************************************
Index: 2
Actual category: virginica
Numeric category: 2
**************************************************
Index: 3
Actual category: versicolor
Numeric category: 1
**************************************************
Index: 4
Actual category: versicolor
Numeric category: 1
**************************************************


In [11]:
# Performing the same operation for y_test
# Replacing each value with a numeric value so that we can calculate the gradients
y_test_num = []
for i in range(len(y_test)):
    if y_test[i] == 'setosa':
        y_test_num.append(0)
    elif y_test[i] == 'versicolor':
        y_test_num.append(1)
    else:
        y_test_num.append(2)

y_test_num = np.array(y_test_num)

In [12]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train_num.shape}")
print('*'*50)
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test_num.shape}")

X_train shape: (112, 4)
y_train shape: (112, 1)
**************************************************
X_test shape: (38, 4)
y_test shape: (38,)


In [13]:
y_train_one_hot = []
for i in range(len(y_train_num)):
    if y_train_num[i] == 0:
        y_train_one_hot.append([1, 0, 0])
    elif y_train_num[i] == 1:
        y_train_one_hot.append([0, 1, 0])
    else:
        y_train_one_hot.append([0, 0, 1])

y_train_one_hot = np.array(y_train_one_hot)

In [14]:
y_train_one_hot.shape

(112, 3)

In [15]:
y_train_one_hot[0]

array([1, 0, 0])

In [16]:
# Each class should have a weight for each feature
# Since we have 3 classes, and 4 features
# We expect a 3 x 4 matrix (each class' weights for each feature is one row)

# So let us randomly initalize the weights
num_classes = len(np.unique(y_train_num))
num_features = X_train.shape[1]

print(f"Since we have {num_classes} classes, and {num_features} features, \
our weight matrix will have shape {num_classes} x {num_features}.")

# Initialize random weights with a value for each feature for each class
weight_matrix = np.random.randn(num_classes, num_features)
print(weight_matrix.shape)

# Logit (raw score) of the first instance being in class 0 with random weights
weight_matrix[0, :].T @ X_train[1]

Since we have 3 classes, and 4 features, our weight matrix will have shape 3 x 4.
(3, 4)


-4.492374407833047

In [17]:
# About to run batch gradient descent on ONE instance
num_epochs = 1000
learning_rate = 0.02
num_instances = X_train.shape[0]

# For each instance, we want to calculate the probability that the instance is in each class
# Since we have 112 instances, and 3 possible classes,
# We will end up with a matrix with 112 rows and 3 columns
all_probs = []

for instance_idx in range(len(X_train)):

    # Step 1: Calculate the raw score of each instance being in each class
    raw_scores_vector = []
    for class_idx in range(len(weight_matrix)):
        class_weights_vector = weight_matrix[class_idx]
        class_raw_score = class_weights_vector.T @ X_train[instance_idx]

        raw_scores_vector.append(class_raw_score)
    raw_scores_vector = np.array(raw_scores_vector)

    # Step 2: Calculating a probability score for each class for each instance
    sum_raw_scores_exponents = np.exp(raw_scores_vector).sum()
    probabilities = []
    for idx in range(len(raw_scores_vector)):
        prob = np.exp(raw_scores_vector[idx]) / sum_raw_scores_exponents
        probabilities.append(prob)
    probabilities = np.array(probabilities)

    all_probs.append(probabilities)

all_probs = np.array(all_probs)
print(all_probs.shape)

(112, 3)


In [18]:
print(X_train.shape)

(112, 4)


In [22]:
# So now we have a matrix where each row is an instance
# And each column is that instance's probability of being in that class

# Now we want to calculate the gradients
# Let us use class 0

# For each instance, the probability that it is in class 0
for class_idx in range(all_probs.shape[1]):
    print(f"--Current Class: {class_idx}")

    curr_class_prob_vector = all_probs[:, class_idx]
    print(f"Class {class_idx} probability vector: {curr_class_prob_vector.shape}")

    # For each instance, whether or not it is in class 0
    curr_class_y_vector = y_train_one_hot[:, class_idx]
    print(f"Class {class_idx} label vector: {curr_class_y_vector.shape}")

    gradient_for_each_instance = []
    for instance_idx in range(len(X_train)):
        pred_error = curr_class_prob_vector[instance_idx] - curr_class_y_vector[instance_idx]
        print(pred_error)

        print(X_train[instance_idx])
        
        pred_error_by_curr_instance = pred_error * X_train[instance_idx]
        print(pred_error_by_curr_instance)
        # gradient = (class_0_weight_vector[instance_idx] - y_train[instance_idx]) @ X_train[instance_idx]


--Current Class: 0
Class 0 probability vector: (112,)
Class 0 label vector: (112,)
-0.977400402719172
[-1.01827123  1.2864604  -1.39338902 -1.3621769 ]
[ 0.99525871 -1.25738691  1.36189899  1.33139225]
-0.9694439236159407
[-0.7730102   2.43545215 -1.33550342 -1.49647603]
[ 0.74939005 -2.36103428  1.29469568  1.45074959]
0.492814986172886
[-0.03722712 -0.78172474  0.74837808  0.92090833]
[-0.01834608 -0.38524567  0.36881193  0.45383742]
0.411695736627649
[0.20803391 0.8268637  0.4010645  0.51801093]
[0.08564667 0.34041626 0.16511654 0.21326289]
0.7248674663661906
[1.06644751 0.13746866 0.51683569 0.3837118 ]
[0.7730331  0.09964656 0.37463738 0.2781402 ]
-0.9504543644882061
[-0.52774918  1.97585545 -1.45127462 -1.09357864]
[ 0.50160151 -1.87796043  1.3793703   1.03939659]
-0.9447057657432235
[-0.52774918  1.51625875 -1.33550342 -1.3621769 ]
[ 0.49856769 -1.43241838  1.26165778  1.28685637]
0.19590557156402694
[-0.40511866 -1.47111979 -0.06202028 -0.28778385]
[-0.079365   -0.28820056 -0.0

In [20]:
np.exp(np.array([1, 2, 3])).sum()

30.19287485057736

In [21]:
# Implementing batch gradient descent
learning_rate = 0.02
num_epochs = 1000
num_instances = X_train.shape[0]

np.random.seed(42)
# Randomly initialize weights for each feature
theta = np.random.randn(X_train.shape[1], 1)
print(f"Initial Weights: {theta}\n")

# 
for epoch in range(num_epochs):
    gradients = 2 / num_instances * X_train.T @ (X_train @ theta - y_train_num)
    theta = theta - learning_rate * gradients

# Final theta
print(f"Final theta: {theta}")

Initial Weights: [[ 0.49671415]
 [-0.1382643 ]
 [ 0.64768854]
 [ 1.52302986]]

Final theta: [[-0.02153836]
 [-0.05733346]
 [ 0.24250165]
 [ 0.53404859]]
