# Matrix and Vector Multiplication

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
matrix_a = np.array([[3, 4, 2]])
matrix_b = np.array([[13,9,7,15],[8,7,4,6], [6,4,0,3]])

In [None]:
np.matmul(matrix_a, matrix_b)

array([[83, 63, 37, 75]])

In [None]:
vector_a = [1, 2, 3]
vector_b = [2, 4, 6]

In [None]:
np.dot(vector_a, vector_b)

28

In [None]:
np.dot(np.transpose(vector_a), vector_b)

28

## Vectorized Parameter Update After Back-propagation
Based on this video:
Vectorizing Logistic Regression's Gradient Output

https://www.coursera.org/learn/neural-networks-deep-learning/lecture/IgFnJ/vectorizing-logistic-regressions-gradient-output

### Numpy Arrays vs. Mathematical Matrices
- When working with Numpy ND-arrays, it is common to place each example or sample as a visual "column" with each row or item in the column holding a feature value of the sample. Each "row" holds different samples' values for one feature X.

- Assume a parameter array W holding n_rows elements, i.e. one weight per row (feature) of X. Use W as the first matrix: take the first and only row of W, multiply each element by the corresponding element in each column of X, and sum the products. If X is to be the first matrix instead, it has to be transposed, because matrix multiplication is not commutative.

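- It was recommended to use explicit n x 1 or 1 x n arrays instead of rank-1 arrays of shape (n,).
A rank-1 array has no row/column orientation, so transposing it does nothing and broadcasting can silently produce unexpected shapes. Some bugs can be avoided by using explicit 2-D shapes.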

In [None]:
# Toy input laid out as (features, examples): the three rows are x1, x2 and x3,
# and each of the five columns is one example.
X = np.array([[11, 21, 31, 41, 51],
              [12, 22, 32, 42, 52],
              [13, 23, 33, 43, 53]])
# One initial weight per feature, kept both as a flat (3,) vector and as a 1 x 3 row matrix.
W_flat = np.array([-1, 1, 1])
W = np.array([W_flat])
# Scalar bias; broadcasting adds it to every z value.
b = -25
print(X)

[[11 21 31 41 51]
 [12 22 32 42 52]
 [13 23 33 43 53]]


In [None]:
X.shape

(3, 5)

In [None]:
df_X = pd.DataFrame(X)
df_X.head()

Unnamed: 0,0,1,2,3,4
0,11,21,31,41,51
1,12,22,32,42,52
2,13,23,33,43,53


In [None]:
df_X.shape

(3, 5)

In [None]:
print(W)

[[-1  1  1]]


In [None]:
W.shape

(1, 3)

In the samples below, W is a matrix of parameters w. Each row is an array of parameters set in one unit or neuron. Each column is an array of parameters corresponding to one feature.

Z is an array of z values. Each z value is the result of applying the linear function $z = w_1x_1 + w_2x_2 + w_3x_3 + b$ to one example.

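Here, the matrix/array of parameters W is multiplied with X. Again, this is NOT equivalent to X * W. Matrix multiplication is not commutative: in general W * X != X * W, and here X * W is not even defined because the shapes (3, 5) and (1, 3) do not line up. What does hold is the transpose rule $(WX)^T = X^T W^T$, so X.T * W.T yields the same z values arranged as a column instead of a row.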

In [None]:
print(X.shape[0], '|', W.shape[1])
assert(X.shape[0] == W.shape[1])

3 | 3


In [None]:
print(X.shape[0], '|', W_flat.shape[0])
assert(X.shape[0] == W_flat.shape[0])

3 | 3


In [None]:
Z = np.dot(W, X) + b
print(Z)

[[-11  -1   9  19  29]]


In [None]:
# np.dot(W, X) has shape (rows of W, columns of X).
# Z's row count must equal the row count of W (the first matrix).
assert Z.shape[0] == W.shape[0], "Z row count mismatch with W row count."
# Z's column count must equal the column count of X (the second matrix).
assert Z.shape[1] == X.shape[1], "Z column count mismatch with X column count."

In [None]:
Z = np.dot(W_flat, X) + b
print(Z)

[-11  -1   9  19  29]


In [None]:
print(X)

[[11 21 31 41 51]
 [12 22 32 42 52]
 [13 23 33 43 53]]


In [None]:
print(X.T)

[[11 12 13]
 [21 22 23]
 [31 32 33]
 [41 42 43]
 [51 52 53]]


In [None]:
Z = np.dot(X.T, W[0]) + b
print(Z)

[-11  -1   9  19  29]


In [None]:
# Alternative format of X with each row being a new example.
# Each column vector holds x values for x1 to x3.
# Each row is a different sample with x1...x3
X = np.array([
    [11, 12, 13],
    [21, 22, 23],
    [31, 32, 33],
    [41, 42, 43],
    [51, 52, 53]
])

In [None]:
df_X = pd.DataFrame(X)
df_X.head()

Unnamed: 0,0,1,2
0,11,12,13
1,21,22,23
2,31,32,33
3,41,42,43
4,51,52,53


In [None]:
df_X.shape

(5, 3)

In [None]:
print(X.T)

[[11 21 31 41 51]
 [12 22 32 42 52]
 [13 23 33 43 53]]


In [None]:
print(X)

[[11 12 13]
 [21 22 23]
 [31 32 33]
 [41 42 43]
 [51 52 53]]


In [None]:
print(W)

[[-1  1  1]]


In [None]:
print(W_flat)

[-1  1  1]


In [None]:
Z = np.dot(W, X.T) + b
print(Z)

[[-11  -1   9  19  29]]


In [None]:
Z = np.dot(W_flat, X.T) + b
print(Z)

[-11  -1   9  19  29]


In [None]:
Z = np.dot(X, W[0]) + b
print(Z)

[-11  -1   9  19  29]


In [None]:
Z = np.dot(X, W_flat) + b
print(Z)

[-11  -1   9  19  29]


In [None]:
Z = np.dot(X, W.T) + b
print(Z)

[[-11]
 [ -1]
 [  9]
 [ 19]
 [ 29]]


Check that the matrix meets the rules of matrix multiplication.
Multiplying an (n1_row, m1_column) matrix by an (n2_row, m2_column) matrix requires m1_column == n2_row and yields a matrix with n1_row rows and m2_column columns.

In [None]:
# np.dot(X, W.T) has shape (rows of X, columns of W.T).
# Z's row count must equal the row count of X (the first matrix).
assert Z.shape[0] == X.shape[0], "Z row count mismatch with X row count."
# Z's column count must equal the column count of the transposed W (the second matrix).
assert Z.shape[1] == W.T.shape[1], "Z column count mismatch with W.transpose column count."

### Find the Result of the Sigmoid Activation Function and Compare to Y

In [None]:
# Broadcasting demo. Here, b = -25
np.array([[1, 1, 1,], [2, 2, 2]]) + b

array([[-24, -24, -24],
       [-23, -23, -23]])

In [None]:
# Sigmoid of every entry of Z in a single broadcast expression: 1 / (1 + e^-z).
A = 1 / (1 + np.exp(-Z))
print(A)

[[1.67014218e-05]
 [2.68941421e-01]
 [9.99876605e-01]
 [9.99999994e-01]
 [1.00000000e+00]]


In [None]:
# Same sigmoid computed one element at a time; the result is a flat Python list of floats.
import math
def sigmoid(Z):
  """Apply the logistic function 1 / (1 + e^-z) to every value in ``Z``.

  Args:
    Z: A scalar, a (possibly nested) sequence, or an array of real numbers.
      Input with more than one dimension, such as a column vector of shape
      (n, 1), is flattened in row-major order.

  Returns:
    A flat Python list of floats, one per element of ``Z``.
  """
  z = np.ravel(np.asarray(Z, dtype=float))
  # exp(-|z|) always lies in [0, 1], so it cannot overflow the way exp(-z) does for very negative z.
  decay = np.exp(-np.abs(z))
  # z >= 0: 1 / (1 + e^-z).  z < 0: the algebraically equal e^z / (1 + e^z).
  result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
  return result.tolist()

A = sigmoid(Z)
print(A)

[1.670142184809518e-05, 0.2689414213699951, 0.9998766054240137, 0.9999999943972036, 0.9999999999997455]


In [None]:
# Find the difference between true labels and predicted labels.
Y_true = np.array([0, 0, 1, 1, 1], dtype=float)
dz = A - Y_true
print(dz)

[ 1.67014218e-05  2.68941421e-01 -1.23394576e-04 -5.60279645e-09
 -2.54463117e-13]


In [None]:
np.set_printoptions(precision=3, suppress=True)
print(dz)

[ 0.     0.269 -0.    -0.    -0.   ]


In [None]:
np.set_printoptions(precision=5, suppress=False)

# Neural Network Demo


In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# The CSV has no header row: every line is one sample (label followed by the pixel values).
# header=None keeps the first sample as data instead of consuming it as column names.
df_mnist_train_small = pd.read_csv("../content/sample_data/mnist_train_small.csv", header=None)
df_mnist_train_small.head()

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_mnist_train_small.shape

(19999, 785)

## Neural Network: SoftMax Activation vs. ReLU Activation
Compare the accuracy of a neural network with ReLU activations for the hidden layers vs. one with Softmax activations for the hidden layers.

**About the MNIST Handwritten Digit Data**

Every line of these files represents one image as 785 numbers. The first number of each line is the label, i.e. the digit (0 to 9) which is depicted in the image. The following 784 numbers are the pixel intensities (0 to 255) of the 28 x 28 image.

In [None]:
df_mnist_train_small.iloc[: , : ].head()

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train Model with SoftMax Activations

In [None]:
import sklearn
from tensorflow.keras.losses import CategoricalCrossentropy

In [None]:
model_neuralnet_softmax = Sequential([
    Dense(units=25, activation="softmax"),
    Dense(units=15, activation="softmax"),
    Dense(units=10, activation="softmax"),
])

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [None]:
model_neuralnet_softmax.compile(loss=SparseCategoricalCrossentropy())
model_neuralnet_softmax.fit(df_mnist_train_small.iloc[: , 1:], df_mnist_train_small.iloc[: , 0:1], epochs=128)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128
Epoch 59/128
Epoch 60/128
Epoch 61/128
Epoch 62/128
Epoch 63/128
Epoch 64/128
Epoch 65/128
Epoch 66/128
Epoch 67/128
Epoch 68/128
Epoch 69/128
Epoch 70/128
Epoch 71/128
Epoch 72/128
Epoch 73/128
Epoch 74/128
Epoch 75/128
Epoch 76/128
Epoch 77/128
Epoch 78

<keras.callbacks.History at 0x7850fb9d3670>

### Train Model with ReLU Activations

In [None]:
# Using a ReLU as the output layer activation function results in a loss of nan, i.e. not a number.
model_neuralnet_relu = Sequential([
    Dense(units=25, activation="relu"),
    Dense(units=15, activation="relu"),
    Dense(units=10, activation="softmax"),
])

In [None]:
model_neuralnet_relu.compile(loss=SparseCategoricalCrossentropy())
model_neuralnet_relu.fit(df_mnist_train_small.iloc[: , 1: ], df_mnist_train_small.iloc[: , 0:1], epochs=128)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128
Epoch 59/128
Epoch 60/128
Epoch 61/128
Epoch 62/128
Epoch 63/128
Epoch 64/128
Epoch 65/128
Epoch 66/128
Epoch 67/128
Epoch 68/128
Epoch 69/128
Epoch 70/128
Epoch 71/128
Epoch 72/128
Epoch 73/128
Epoch 74/128
Epoch 75/128
Epoch 76/128
Epoch 77/128
Epoch 78

<keras.callbacks.History at 0x7850fae9caf0>

In [None]:
# The CSV has no header row: every line is one sample (label followed by the pixel values).
# header=None keeps the first sample as data instead of consuming it as column names.
df_mnist_test = pd.read_csv("../content/sample_data/mnist_test.csv", header=None)
df_mnist_test.head()

Unnamed: 0,7,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.658,0.659,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
y_pred_sm = model_neuralnet_softmax.predict(x=df_mnist_test.iloc[: , 1: ])
y_pred_sm[0:5]



array([[4.20090e-03, 5.17402e-03, 2.96455e-01, 3.03878e-01, 8.04293e-03,
        2.52159e-02, 3.22906e-01, 7.34398e-03, 2.29537e-02, 3.83010e-03],
       [7.83847e-04, 9.71213e-01, 2.09207e-03, 1.67747e-03, 2.36660e-03,
        2.14123e-03, 1.79525e-03, 1.73767e-03, 1.25027e-02, 3.68987e-03],
       [9.22901e-01, 9.23057e-04, 1.26196e-02, 5.62160e-03, 6.75974e-03,
        1.12174e-02, 1.00753e-02, 3.24117e-03, 9.61425e-03, 1.70268e-02],
       [1.24261e-03, 1.07864e-03, 1.09647e-02, 4.16875e-03, 4.74098e-01,
        6.29790e-03, 7.49343e-03, 1.73299e-02, 5.59712e-03, 4.71729e-01],
       [7.83847e-04, 9.71213e-01, 2.09207e-03, 1.67747e-03, 2.36660e-03,
        2.14123e-03, 1.79525e-03, 1.73767e-03, 1.25027e-02, 3.68987e-03]],
      dtype=float32)

In [None]:
y_pred_relu = model_neuralnet_relu.predict(x=df_mnist_test.iloc[ : , 1: ])
y_pred_relu[0:5]



array([[0.00000e+00, 1.57045e-20, 1.00000e+00, 1.76345e-22, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00],
       [0.00000e+00, 1.00000e+00, 1.19386e-38, 0.00000e+00, 0.00000e+00,
        4.09792e-33, 1.19374e-37, 3.86701e-38, 0.00000e+00, 1.26726e-30],
       [1.00000e+00, 0.00000e+00, 6.13459e-21, 1.33813e-22, 3.19460e-26,
        1.79963e-18, 2.43546e-18, 3.76386e-20, 0.00000e+00, 3.64662e-21],
       [0.00000e+00, 1.55309e-09, 1.93920e-13, 9.17151e-20, 9.99697e-01,
        2.06399e-18, 3.52792e-17, 2.41105e-07, 0.00000e+00, 3.02807e-04],
       [0.00000e+00, 1.00000e+00, 2.25434e-25, 1.37651e-37, 0.00000e+00,
        6.72296e-22, 1.26870e-24, 3.70681e-25, 1.11861e-37, 2.57065e-20]],
      dtype=float32)

In [None]:
df_mnist_test.iloc[ : , 0:1].head()

Unnamed: 0,7
0,2
1,1
2,0
3,4
4,1


## Recommended Implementation of Softmax Digit Classifier Neural Network

Use a linear activation function in the output layer. Its output (the z values, or logits) is the input to the Sparse Categorical Cross Entropy function, which applies the softmax and the loss to the z values itself when `from_logits=True` is set.

In [None]:
model_neuralnet_softmax_fromlogits = Sequential([
    Dense(units=25, activation="softmax"),
    Dense(units=15, activation="softmax"),
    Dense(units=10, activation="linear"),
])
model_neuralnet_softmax_fromlogits.compile(loss=SparseCategoricalCrossentropy(from_logits=True))
model_neuralnet_softmax_fromlogits.fit(df_mnist_train_small.iloc[: , 1:], df_mnist_train_small.iloc[: , 0:1], epochs=128)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128
Epoch 59/128
Epoch 60/128
Epoch 61/128
Epoch 62/128
Epoch 63/128
Epoch 64/128
Epoch 65/128
Epoch 66/128
Epoch 67/128
Epoch 68/128
Epoch 69/128
Epoch 70/128
Epoch 71/128
Epoch 72/128
Epoch 73/128
Epoch 74/128
Epoch 75/128
Epoch 76/128
Epoch 77/128
Epoch 78

<keras.callbacks.History at 0x7850f0f032e0>

In [None]:
y_pred_sm_fl = model_neuralnet_softmax_fromlogits.predict(x=df_mnist_test.iloc[: , 1: ])
y_pred_sm_fl[0:5]



array([[-6.09708, -7.54079, -1.9256 , -5.43155, -6.4216 , -6.25328,
        -1.73158, -7.26212, -6.23182, -8.10736],
       [-9.97325, -2.54858, -9.16281, -9.18023, -8.51398, -9.42647,
        -9.67234, -9.05302, -8.14181, -9.18845],
       [-2.27404, -7.91402, -5.16354, -2.33255, -7.30065, -2.45445,
        -5.90886, -6.75759, -5.10265, -5.67199],
       [-5.36014, -6.91368, -4.80385, -5.36641,  0.45363, -4.48229,
        -5.06563, -4.525  , -5.09459, -2.41468],
       [-9.97325, -2.54858, -9.16281, -9.18023, -8.51398, -9.42647,
        -9.67234, -9.05302, -8.14181, -9.18845]], dtype=float32)

In [None]:
model_neuralnet_relu_fromlogits = Sequential([
    Dense(units=25, activation="relu"),
    Dense(units=15, activation="relu"),
    Dense(units=10, activation="linear"),
])
model_neuralnet_relu_fromlogits.compile(loss=SparseCategoricalCrossentropy(from_logits=True))
model_neuralnet_relu_fromlogits.fit(df_mnist_train_small.iloc[: , 1:], df_mnist_train_small.iloc[: , 0:1], epochs=128)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128
Epoch 59/128
Epoch 60/128
Epoch 61/128
Epoch 62/128
Epoch 63/128
Epoch 64/128
Epoch 65/128
Epoch 66/128
Epoch 67/128
Epoch 68/128
Epoch 69/128
Epoch 70/128
Epoch 71/128
Epoch 72/128
Epoch 73/128
Epoch 74/128
Epoch 75/128
Epoch 76/128
Epoch 77/128
Epoch 78

<keras.callbacks.History at 0x7850f0efc280>

In [None]:
# Logits predicted by the ReLU network that was trained with from_logits=True.
y_pred_relu_fl = model_neuralnet_relu_fromlogits.predict(x=df_mnist_test.iloc[: , 1: ])
# Display this model's own predictions rather than the softmax model's y_pred_sm_fl.
y_pred_relu_fl[0:5]



array([[-6.09708, -7.54079, -1.9256 , -5.43155, -6.4216 , -6.25328,
        -1.73158, -7.26212, -6.23182, -8.10736],
       [-9.97325, -2.54858, -9.16281, -9.18023, -8.51398, -9.42647,
        -9.67234, -9.05302, -8.14181, -9.18845],
       [-2.27404, -7.91402, -5.16354, -2.33255, -7.30065, -2.45445,
        -5.90886, -6.75759, -5.10265, -5.67199],
       [-5.36014, -6.91368, -4.80385, -5.36641,  0.45363, -4.48229,
        -5.06563, -4.525  , -5.09459, -2.41468],
       [-9.97325, -2.54858, -9.16281, -9.18023, -8.51398, -9.42647,
        -9.67234, -9.05302, -8.14181, -9.18845]], dtype=float32)

Softmax Activations:
Lowest without Logits: ~0.95. Lowest using Logits: ~0.87

ReLU with a Softmax output layer:
Lowest without Logits: ~0.86. Lowest using Logits: ~0.82.

## Compare Model Accuracy

In [None]:
from sklearn.metrics import balanced_accuracy_score

The predictions are arrays of arrays. Each first-level array gives a list of probabilities that the input matches a certain label. These have to be translated into predicted integer values for the balanced accuracy scorer to work.

## Intuition for Derivatives

### Exponent Rule of Derivatives.

In [None]:
import sympy

In [None]:
J, w = sympy.symbols('J, w')

In [None]:
J = w ** 2
J

w**2

In [None]:
dJ_dw = sympy.diff(J, w)

In [None]:
dJ_dw

2*w

In [None]:
dJ_dw.subs([(w, 2)])

4

In [None]:
J = w ** 3
dJ_dw = sympy.diff(J, w)
dJ_dw

3*w**2

In [None]:
dJ_dw.subs([(w, 2)])

12

## L2 Norm/Euclidean Distance of Vectors
Comparing the L2 Norm of a separately approximated and actual calculated derivative vector can be used to tell if your model is miscalculating derivatives.

In [None]:
from numpy import array
from numpy.linalg import norm

In [None]:
# Example gradient vectors: a numerical approximation and the calculated one.
arr_appr, arr_calc = array([-1, -2, 3, 4, 5]), array([1, 2, 3, 4, 5])
norml2_appr, norml2_calc = norm(arr_appr, ord=2), norm(arr_calc, ord=2)
# Relative difference: ||appr - calc||_2 / (||appr||_2 + ||calc||_2).
grad_diff = norm(arr_appr - arr_calc, ord=2) / (norml2_appr + norml2_calc)
print(grad_diff)

0.30151134457776363


In [None]:
norml2_appr = norm(arr_appr, 2)
norml2_calc = norm(arr_calc, 2)
grad_diff = norm((arr_appr - (arr_calc * 2)), 2)/(norml2_appr + norml2_calc)
print(grad_diff)

0.6571287406727708


## Introduction to TensorFlow

In [None]:
w = tf.Variable(0, dtype=tf.float32)

In [None]:
from tensorflow.python.ops.variables import trainable_variables
optimizer = tf.keras.optimizers.Adam(0.1)

@tf.function
def train_step():
  """Apply one optimizer update to the module-level variable ``w``.

  The cost being minimised is w**2 - 10*w + 25. Both ``w`` and ``optimizer``
  are read from the enclosing notebook scope.
  """
  params = [w]
  # Record the cost computation so its gradient with respect to w can be taken.
  with tf.GradientTape() as tape:
    loss = w**2 - 10 * w + 25
  gradients = tape.gradient(loss, params)
  optimizer.apply_gradients(zip(gradients, params))

In [None]:
print(w)
train_step()
print(w)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>
<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.09999931>


In [None]:
for i in range(64):
  train_step()
print(w)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=4.6682186>


In [None]:
for i in range(512):
  train_step()
print(w)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=5.0000014>


### TensorFlow with Input Array x and Parameter w
The optimizer has to be initialized again to avoid errors.

In [None]:
optimizer = tf.keras.optimizers.Adam(0.1)
w = tf.Variable(0, dtype=tf.float32)
x = np.array([1, -10, 25], dtype = np.float32)

@tf.function
def training(x, w, optimizer, epochs = 512):
  """Minimise x[0] * w**2 + x[1] * w + x[2] with respect to ``w``.

  Args:
    x: Indexable holding the three coefficients of the quadratic cost.
    w: The parameter passed to ``optimizer.minimize`` as the only variable to update.
    optimizer: Object exposing ``minimize(loss_fn, var_list)``.
    epochs: Number of times ``minimize`` is called.

  Returns:
    ``w`` after the last ``minimize`` call.
  """
  # Zero-argument closure so the optimizer can re-evaluate the cost on every call.
  def quadratic_cost():
    return x[0] * w**2 + x[1] * w + x[2]
  for _ in range(epochs):
    optimizer.minimize(quadratic_cost, [w])
  # NOTE(review): under @tf.function this print may only run while tracing - confirm.
  print(w)
  return w

In [None]:
print(w)
w = training(x, w, optimizer)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>
<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=5.0000014>


## Distribution Strategies

### Mirrored Strategy
- One replica per device is trained. The replicas train in parallel on different slices of each batch. At every training step the gradients are aggregated across replicas, so all copies of the parameters stay synchronized. The synchronization is done using "an all-reduce algorithm."

- As of August 2023, only Nvidia GPUs are supported. Synchronization is done using NVIDIA NCCL.

In [None]:
tfds.disable_progress_bar()

import os

In [None]:
datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
mnist_train, mnist_test = datasets['train'], datasets['test']

In [None]:
strategy = tf.distribute.MirroredStrategy()
# When using GPUs of a different model, enable and account for cross-device operations.
# strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
print(f"Device count: {strategy.num_replicas_in_sync}")



Device count: 1


In [None]:
# Shuffle buffer of 2^14 elements.
BUFFER_SIZE = 2 ** 14
BATCH_SIZE_PER_REPLICA = 512
# The global batch size grows with the number of replicas kept in sync.
BATCH_SIZE = strategy.num_replicas_in_sync * BATCH_SIZE_PER_REPLICA
# Minimum number of cores a GPU needs in order to be included in the replication.
# NOTE(review): this is set after the strategy was created; it may need to be set
# before TensorFlow first initialises its GPU devices to take effect - confirm.
os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '2'

In [None]:
# For data scaling, not relevant to distribution.
@tf.function
def scale(image, label):
  """Cast ``image`` to float32 and divide it by 255; ``label`` is passed through unchanged."""
  scaled_image = tf.cast(image, tf.float32) / 255
  return scaled_image, label

In [None]:
train_data = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
eval_data = mnist_test.map(scale).batch(BATCH_SIZE)

In [None]:
# Define model within scope of strategy.scope()
with strategy.scope():
  model = tf.keras.Sequential([
      tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10)
  ])


In [None]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

model.fit(train_data, epochs=16)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<keras.callbacks.History at 0x7f89b17efb80>

### TPU Strategy
TPUs are chips made for machine learning and are often slightly cheaper than other chip types. In Colab, make sure to change your runtime environment hardware to a TPU before using this strategy.

In [None]:
# Connect to the Colab TPU and build a TPUStrategy; the cell fails if any step raises.
try:
  # Get IP address and port. Accessible using RPC (Remote Procedure Call).
  # A KeyError here means the COLAB_TPU_ADDR environment variable is not set.
  tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
  # Find TPU identifier/pointer for use in code.
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_address)
  tf.config.experimental_connect_to_cluster(tpu)
  # The TPU system has to be initialised before a TPUStrategy is created on it.
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.TPUStrategy(tpu)
  print('Running on TPU worker:', tpu.cluster_spec().as_dict()['worker'])
  print('Number of accelerators: ', strategy.num_replicas_in_sync)
except:
  # Report the failure, then re-raise so the original error and traceback stay visible.
  print('TPU did not initialize.')
  raise


Running on TPU worker: ['10.124.238.154:8470']
Number of accelerators:  8
