In [21]:
#This implementation demonstrates how softmax multiclass classification works for datasets with more than 2 linearly separable classes.
#The implementation uses the load_iris dataset for the demonstration. 3 distinct classes are available here.

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
#Importing all the standard libraries + dataset.

In [22]:
data = load_iris()
#Fetching the dataset bundle (features + class labels).
X_raw, Y_raw = data.data, data.target
#Splitting the bundle into the raw feature matrix and the raw label vector.
Y = Y_raw.reshape(-1, 1)
#Converting the labels into a column matrix for further operations.

In [23]:
scaler = StandardScaler().fit(X_raw)
X_scaled = scaler.transform(X_raw)
#Scaling the X matrix with the fitted scaler for overflow prevention.
one = np.ones((len(X_scaled), 1))
X = np.concatenate((one, X_scaled), axis=1)
#Prepending a column of ones so that the bias term is learned like any other weight.

In [24]:
print(X.shape, Y.shape, sep="\n")
#Verifying the shapes: X first, then Y, each on its own line.

(150, 5)
(150, 1)


In [25]:
Q = np.zeros((np.unique(Y).size, X.shape[1]))
#Initialising the theta matrix with 0's: one row per class, one column per column of X (bias included).
#np.unique flattens its input by default, so Y can be passed in directly.
def grad(pred_stat, X_i, Y_i):
    """Gradient of the softmax cross-entropy loss with respect to theta.

    Works for a single sample (which is how the training loop calls it) and,
    as a generalisation, for a mini-batch, in which case the per-sample
    gradients are summed.

    Parameters
    ----------
    pred_stat : array of shape (1, n_classes) or (batch, n_classes)
        Predicted class probabilities, one row per sample.
    X_i : array of shape (1, n_features) or (batch, n_features)
        Input rows matching the rows of pred_stat.
    Y_i : int or array of ints
        Index of the true class of every sample.

    Returns
    -------
    ndarray of shape (n_classes, n_features)
        pred_stat.T @ X_i, with every sample's input row subtracted from the
        row belonging to its true class. The inputs are not modified.
    """
    probs = np.atleast_2d(pred_stat)
    feats = np.atleast_2d(X_i)
    labels = np.ravel(Y_i)
    #Normalising the inputs so that a scalar label and a column/row of labels are handled alike.
    gradient = probs.T @ feats
    #Contribution of the predicted probabilities: sum over samples of p * x.
    np.subtract.at(gradient, labels, feats)
    #Contribution of the target one-hot-vectors. The unbuffered .at form is needed so that
    #several samples sharing the same class are all subtracted from that class's row.
    return gradient

In [28]:
rng = np.random.default_rng(0)
#Seeded random generator, so the shuffling order (and hence the printed accuracies) is reproducible.
Q = np.zeros((np.unique(Y).size, X.shape[1]))
#Restarting theta from 0's, so that re-running this cell does not silently continue from an earlier run.
epochs = 10
#Arbitrarily set no. of epochs.
alpha = 0.001
#Arbitrarily set initial learning rate.
decay = 0.95
#Factor applied to the learning rate after every epoch.
for j in range(1, epochs + 1):
    #j is the epoch being run (counted from 1), so it equals epochs once training is finished.
    indices = rng.permutation(X.shape[0])
    #Visiting the inputs in a fresh random order every epoch.
    for i in indices:
        #Stochastic gradient descent: one update per input.
        X_i = X[[i], :]
        #Storing the ith input as a row matrix.
        logit_i = X_i @ Q.T
        #Creating the logit space: one score per class.
        logit_i -= np.max(logit_i)
        #Shifting by the max prevents overflow in exp and leaves the softmax result unchanged.
        exp_i = np.exp(logit_i)
        pred_stat = exp_i / np.sum(exp_i)
        #Getting our predicted one-hot-vector with probabilistic entries (exp is computed only once).
        Y_i = Y[i, 0]
        #Fetching the index of true output.
        Q -= alpha*grad(pred_stat, X_i, Y_i)
        #Basic updation rule.
    H = Q @ X.T
    #Linear scores of every class (rows) for every input (columns).
    preds = np.argmax(H, axis = 0).reshape(-1, 1)
    #The largest score also has the largest probability, so no conversion into [0, 1] space is needed.
    correct = (preds == Y)
    #Boolean matrix marking the correct predictions.
    accuracy = np.mean(correct) * 100
    print(accuracy)
    #Printing accuracy of predictions on overall dataset per epoch.
    alpha *= decay
    #Decaying the learning rate to reduce the noisiness of later epochs.

84.66666666666667
84.66666666666667
84.66666666666667
84.66666666666667
85.33333333333334
84.66666666666667
84.66666666666667
84.66666666666667
84.66666666666667
84.66666666666667


# Overview:
This is a standard implementation of softmax regression for multiclass classification, trained with the cross-entropy loss. Multiclass classification problems are very common in the real world, and this demonstration goes in depth into how the math works, along with the code and the basic implementation practices that are essential.
# Learning, mistakes, doubts:
1. Learned a vectorized C-loop command ("np.unique(Y).size") to note the total no. of unique classes possible. Crucial for making the one-hot-vector.
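2. The first command is applied to the flattened matrix here. Strictly speaking the explicit flatten is optional: with the default `axis=None`, `np.unique` flattens its input itself, so it also works on N-D arrays.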
3. Subtracting the max entry of the logit space from all entries of that space prevents overflow without changing the mathematical result. This was a small suggestion by ChatGPT, which I later verified mathematically as well.
4. Learned about the `np.argmax` command, which fetches the index of the maximum element along an axis of a matrix using C-loop vectorization.

Thank you for sticking around :)