In [2]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense, Normalization
from tensorflow.keras.models import Sequential
from sklearn.datasets import make_blobs

In [3]:
def my_softmax(z):
    ez = np.exp(z)
    sm = ez/np.sum(ez)
    return sm

In [4]:
centers = [[-5, 2], [-2, -2], [1, 2], [5, -2]]
X_train, y_train = make_blobs(n_samples=2000, centers=centers, cluster_std=1.0, random_state=30)

In [5]:
X_train.shape

(2000, 2)

## Method 1 : The Obvious Organization

### Defining Model

In [6]:
model = Sequential(
    [
        Dense(units=25, activation='relu'),
        Dense(units=15, activation='relu'),
        Dense(units=4, activation='softmax')
    ]
)

### Calculating Losses

In [7]:
model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(0.001),
)

### Train Model

In [11]:
model.fit(
    X_train,
    y_train,
    epochs = 10
)

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 509us/step - loss: 1.0596 
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 407us/step - loss: 0.4107
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 407us/step - loss: 0.1850
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 387us/step - loss: 0.1099
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386us/step - loss: 0.0732
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 379us/step - loss: 0.0625
Epoch 7/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 392us/step - loss: 0.0476
Epoch 8/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 397us/step - loss: 0.0509
Epoch 9/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 375us/step - loss: 0.0454
Epoch 10/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 377us/step - l

<keras.src.callbacks.history.History at 0x16bd0ad20>

Because the softmax is integrated into the output layer, the output is a vector of probabilities

In [13]:
p_nonpreferred = model.predict(X_train)
print(p_nonpreferred)
print(f"largest value = {np.max(p_nonpreferred)}, smallest value = {np.min(p_nonpreferred)}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308us/step
[[4.6903500e-03 6.3831690e-03 9.7057939e-01 1.8347105e-02]
 [9.9541843e-01 4.3183039e-03 3.9081162e-05 2.2420721e-04]
 [9.6415263e-01 3.3804245e-02 5.8847514e-04 1.4546210e-03]
 ...
 [1.3928416e-03 9.9288809e-01 4.6160221e-03 1.1030221e-03]
 [6.1891609e-05 6.0748021e-05 3.5321962e-05 9.9984211e-01]
 [3.3076198e-03 4.1688266e-03 9.8625135e-01 6.2722987e-03]]
largest value = 0.9999996423721313, smallest value = 4.70775347594099e-08


## Method 2 : Preferred

### Define model

In [21]:
preferred_model = Sequential(
    [
        Dense(units=25, activation='relu'),
        Dense(units=15, activation='relu'),
        Dense(units=4, activation='linear'),
    ]
)

### Calculate losses

In [22]:
preferred_model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer = tf.keras.optimizers.Adam(0.001)
)

### Train model

In [23]:
model.fit(
    X_train,
    y_train,
    epochs = 10
)

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 746us/step - loss: 0.0187
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396us/step - loss: 0.0176
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 399us/step - loss: 0.0206
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 385us/step - loss: 0.0152
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391us/step - loss: 0.0176
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 393us/step - loss: 0.0141
Epoch 7/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381us/step - loss: 0.0170
Epoch 8/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 384us/step - loss: 0.0171
Epoch 9/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386us/step - loss: 0.0145
Epoch 10/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383us/step - lo

<keras.src.callbacks.history.History at 0x16c15a540>

Notice that in the preferred model, the outputs are not probabilities, but can range from large negative numbers to large positive numbers. The output must be sent through a softmax when performing a prediction that expects a probability. Let's look at the preferred model outputs

In [24]:
p_preferred = preferred_model.predict(X_train)
print(f"largest value = {np.max(p_preferred)}, smallest value = {np.min(p_preferred)}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 421us/step
largest value = 2.1341941356658936, smallest value = -1.109634518623352


In [25]:
sm_preferred = tf.nn.softmax(p_preferred).numpy()
print(f"largest value = {np.max(sm_preferred)}, smallest value = {np.min(sm_preferred)}")

largest value = 0.7528277039527893, smallest value = 0.031512174755334854


To select the most likely category, the softmax is not required. One can find the index of the largest output using np.argmax()

In [26]:
for i in range(5):
    print(f"{p_preferred[i]}, category: {np.argmax(p_preferred[i])}")

[-0.23924148  0.06863128 -0.04785182 -0.0232656 ], category: 1
[ 0.37471628 -0.33432016  1.2355483  -0.7394061 ], category: 2
[ 0.2893575  -0.2743155   0.93448514 -0.5638379 ], category: 2
[-0.29910663 -0.26310387  0.36898768  0.06318613], category: 2
[-0.3159943   0.02626276  0.35934338 -0.15024407], category: 2
