In [1]:
import numpy as np
import torch
import tensorflow as tf

In [2]:
torch.set_default_dtype(torch.float64)
tf.keras.config.set_floatx('float64')

In [3]:
def add_to_class(Class):  
    """Register functions as methods in created class."""
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
    return wrapper

# Dataset

## create dataset

In [4]:
from sklearn.datasets import make_classification

M: int = 100
N: int = 5
N_CLASS: int = 3

X, Y = make_classification(n_samples=M, n_features=N, n_classes=N_CLASS,
                           n_informative=N-1, n_redundant=0)

print(X.shape)
print(Y.shape)

(100, 5)
(100,)


## one hot encoding

In [5]:
Y_hat = torch.nn.functional.one_hot(torch.tensor(Y).long(), 3)
Y_hat.shape

torch.Size([100, 3])

## split dataset into train and valid

In [6]:
X_train, X_valid = torch.tensor(X[:85]), torch.tensor(X[85:])
Y_train, Y_valid = Y_hat[:85], Y_hat[85:]

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

torch.Size([85, 5]) torch.Size([85, 3])
torch.Size([15, 5]) torch.Size([15, 3])


# Model

## weights and bias

Trainables parameters
$$
\begin{align*}
\mathbf{W} &\in \mathbb{R}^{N \times Q} \\
\mathbf{b} &\in \mathbb{R}^{Q}
\end{align*}
$$
where $N$ is the number of features and $Q$ is the number of classes.

In [7]:
class SoftmaxClassifier:
    def __init__(self, n_features: int, n_classes: int):
        self.w = torch.randn(n_features, n_classes)
        self.b = torch.randn(n_classes)

    def copy_params(self, tf_model) -> None:
        """Copy the parameters from a TensorFlow model to this PyTorch model.

        Args:
            tf_model: A TensorFlow model from which to copy the parameters.

        Returns:
            None
        """
        self.w.copy_(torch.tensor(tf_model.weights[0].numpy()))
        self.b.copy_(torch.tensor(tf_model.weights[1].numpy()))

## weighted sum and softmax function

weighted sum
$$
\mathbf{Z}(\mathbf{X}) = \mathbf{X} \mathbf{W} + \mathbf{b} \\
\mathbf{Z} : \mathbb{R}^{M \times N} \rightarrow \mathbb{R}^{M \times Q}
$$

softmax function
$$
\sigma(\mathbf{Z}_{i,:})_{j} = \frac{\exp(\mathbf{Z}_{i,:})_{j}}
{\sum_{k=1}^{Q}(\exp(\mathbf{Z}_{i,:})_{k})}
$$
then:
$$
\sigma(\mathbf{Z}_{i,:}) = \begin{bmatrix}
    \sigma(\mathbf{Z}_{i,:})_{1} &
    \sigma(\mathbf{Z}_{i,:})_{2} &
    \cdots &
    \sigma(\mathbf{Z}_{i,:})_{Q}
\end{bmatrix}
$$
therefore:
$$
\sigma(\mathbf{Z}) = \begin{bmatrix}
    \sigma(\mathbf{Z}_{1,:}) \\
    \sigma(\mathbf{Z}_{2,:}) \\
    \vdots \\
    \sigma(\mathbf{Z}_{M,:})
\end{bmatrix} \\
\sigma(\mathbf{Z}) : \mathbb{R}^{M \times Q} \rightarrow \mathbb{R}^{M \times Q}
$$

In [8]:
@add_to_class(SoftmaxClassifier)
def predict(self, x: torch.Tensor) -> torch.Tensor:
    # weighted sum
    z = torch.matmul(x, self.w) + self.b
    # softmax function
    z_exp = torch.exp(z)
    y_pred = z_exp / z_exp.sum(1, keepdims=True)
    return y_pred

## CE

Loss function: Cross Entropy-loss:
$$
L(\mathbf{\hat{Y}}) = - \frac{1}{M} \sum_{i=1}^{M} \sum_{k=1}^{Q}(
    Y_{ik} \log(\hat{Y}_{ik})
) \\
L(\mathbf{\hat{Y}}) : \mathbb{R}^{M \times Q} \rightarrow \mathbb{R}
$$
**Remark**: for this case $\mathbf{\hat{Y}}$ is $\sigma(\mathbf{Z})$. It is not obligatory to use softmax for CE.<br>
Vectorized form:
$$
L(\mathbf{\hat{Y}}) = - \frac{1}{M} \sum_{i=1}^{M} \left(
    \mathbf{y}_{i,:}^T \log(\mathbf{\hat{y}}_{i,:})
\right)
$$
or
$$
L(\mathbf{\hat{Y}}) = - \frac{1}{M} \sum \left(
    \mathbf{Y} \odot \log(\mathbf{\hat{Y}})
\right)
$$

In [9]:
@add_to_class(SoftmaxClassifier)
def evaluate(self, x: torch.Tensor, y_true: torch.Tensor) -> float:
    y_pred = self.predict(x)
    loss = y_true * torch.log(y_pred)
    return - loss.sum().item() / len(y_true)

## Gradient

Gradient descent is:
$$
\frac{\partial L}{\partial \mathbf{W}} =
\frac{\partial L}{\partial \sigma}
\frac{\partial \sigma}{\partial \mathbf{Z}}
\frac{\partial \mathbf{Z}}{\partial \mathbf{W}}
$$
and
$$
\frac{\partial L}{\partial \mathbf{b}} =
\frac{\partial L}{\partial \sigma}
\frac{\partial \sigma}{\partial \mathbf{Z}}
\frac{\partial \mathbf{Z}}{\partial \mathbf{b}}
$$
where their shapes are:
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{W}} &\in \mathbb{R}^{N \times Q} \\
\frac{\partial L}{\partial \mathbf{b}} &\in \mathbb{R}^{Q} \\
\frac{\partial L}{\partial \sigma} &\in \mathbb{R}^{M \times Q} \\
\frac{\partial \sigma}{\partial \mathbf{Z}} &\in \mathbb{R}^{(M \times Q) \times (M \times Q)} \\
\frac{\partial \mathbf{Z}}{\partial \mathbf{W}} &\in \mathbb{R}^{(M \times Q) \times (N \times Q)} \\
\frac{\partial \mathbf{Z}}{\partial \mathbf{b}} &\in \mathbb{R}^{(M \times Q) \times Q}
\end{align*}
$$

### weighted sum derivative

#### $\frac{\partial \mathbf{Z}}{\partial \mathbf{W}}$
$$
\mathbf{X} = \begin{bmatrix}
    x_{11} & x_{12} & \cdots & x_{1N} \\
    x_{21} & x_{22} & \cdots & x_{2N} \\
    \vdots & \vdots & \ddots & \vdots \\
    x_{M1} & x_{M2} & \cdots & x_{MN}
\end{bmatrix} = \begin{bmatrix}
    \mathbf{x}_{1}^T \\
    \mathbf{x}_{2}^T \\
    \vdots \\
    \mathbf{x}_{M}^T
\end{bmatrix} \\
\mathbf{W} = \begin{bmatrix}
    w_{11} & w_{12} & \cdots & w_{1Q} \\
    w_{21} & w_{22} & \cdots & w_{2Q} \\
    \vdots & \vdots & \ddots & \vdots \\
    w_{N1} & w_{N2} & \cdots & w_{NQ} 
\end{bmatrix} = \begin{bmatrix}
    \mathbf{w}_{1} &
    \mathbf{w}_{2} &
    \cdots &
    \mathbf{w}_{Q}
\end{bmatrix}
$$
where $\mathbf{x}_{p}^T = \begin{bmatrix} x_{p1} & x_{p2} & \cdots & x_{pN} \end{bmatrix}$ 
and $\mathbf{w}_{q} = \begin{bmatrix} w_{1q} & w_{2q} & \cdots & w_{Nq} \end{bmatrix}^T$
, then
$$
\mathbf{XW} = \begin{bmatrix}
    \mathbf{x}_{1}^T \mathbf{w}_{1} & \mathbf{x}_{1}^T \mathbf{w}_{2} & \cdots & \mathbf{x}_{1}^T \mathbf{w}_{Q} \\
    \mathbf{x}_{2}^T \mathbf{w}_{1} & \mathbf{x}_{2}^T \mathbf{w}_{2} & \cdots & \mathbf{x}_{2}^T \mathbf{w}_{Q} \\
    \vdots & \vdots & \ddots & \vdots \\
    \mathbf{x}_{M}^T \mathbf{w}_{1} & \mathbf{x}_{M}^T \mathbf{w}_{2} & \cdots & \mathbf{x}_{M}^T \mathbf{w}_{Q}
\end{bmatrix}
$$
therefore:
$$
\frac{\partial \mathbf{XW}_{pq}}{\partial w_{ij}} = 
\frac{\partial \mathbf{x}_{p}^T\mathbf{w}_{q}}{\partial w_{ij}} = \begin{cases}
    x_{pi} & \text{ if } q=j \\ 
    0 & \text{ if } q\neq j 
\end{cases}
$$
for all $p = 1, ..., M$, $q, j = 1, ..., Q$ and $i = 1, ..., N$. <br>
Vectorized form:
$$
\frac{\partial \mathbf{Z}}{\partial \mathbf{W}} = 
\mathbb{I} \otimes \mathbf{X}
$$
where $\otimes$ is Kronecker product.

#### $\frac{\partial \mathbf{Z}}{\partial \mathbf{b}}$
$$
\mathbf{XW + b} = \begin{bmatrix}
    \mathbf{x}_{1}^T \mathbf{w}_{1}+b_1 & \mathbf{x}_{1}^T \mathbf{w}_{2}+b_2 & \cdots & \mathbf{x}_{1}^T \mathbf{w}_{Q}+b_Q \\
    \mathbf{x}_{2}^T \mathbf{w}_{1}+b_1 & \mathbf{x}_{2}^T \mathbf{w}_{2}+b_2 & \cdots & \mathbf{x}_{2}^T \mathbf{w}_{Q}+b_Q  \\
    \vdots & \vdots & \ddots & \vdots \\
    \mathbf{x}_{M}^T \mathbf{w}_{1}+b_1 & \mathbf{x}_{M}^T \mathbf{w}_{2}+b_2 & \cdots & \mathbf{x}_{M}^T \mathbf{w}_{Q}+b_Q 
\end{bmatrix}
$$
therefore:
$$
\frac{\partial \mathbf{(XW+b)}_{pq}}{\partial b_{i}} = 
\frac{\partial \mathbf{x}_{p}^T\mathbf{w}_{q} + b_{q}}{\partial b_{i}} = \begin{cases}
    1 & \text{ if } q=i \\ 
    0 & \text{ if } q\neq i
\end{cases}
$$
for all $p = 1, ..., M$ and $q,i = 1, ..., Q$.

### softmax derivative summary

First case:
$$
\frac{\partial \sigma(\mathbf{Z})_{p,:}}{\partial \mathbf{Z}_{i=p,:}} = \text{diag}(\sigma(\mathbf{Z}_{p,:})) - \sigma(\mathbf{Z}_{p,:}) \sigma(\mathbf{Z}_{p,:})^T
$$

Second case:
$$
\frac{\partial \sigma(\mathbf{Z})_{p,:}}{\partial \mathbf{Z}_{i\neq p,:}} = \mathbf{0}
$$
Please check [Softmax Function and Gradient](softmax_function_and_gradient.ipynb)

### cross-entropy derivative

We will use this CE:
$$
\begin{align*}
L(\mathbf{\hat{Y}}) &= - \frac{1}{M} \sum_{i=1}^{M} \left(
    \mathbf{y}_{i,:}^T \log(\mathbf{\hat{y}}_{i,:})
\right) \\
&= -\frac{1}{M} \left(
    \mathbf{y}_{1,:}^T \log(\mathbf{\hat{y}}_{1,:}) +
    \mathbf{y}_{2,:}^T \log(\mathbf{\hat{y}}_{2,:}) +
    ... + 
    \mathbf{y}_{M,:}^T \log(\mathbf{\hat{y}}_{M,:})
\right)
\end{align*}
$$
$$
\begin{align*}
\frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{p,:}} &= \begin{bmatrix}
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{p1}} & 
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{p2}} &
    \cdots &
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{pQ}}
\end{bmatrix} \in \mathbb{R}^{1 \times Q} \\
\frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{Y}}} &= \begin{bmatrix}
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{1,:}} \\
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{2,:}} \\
    \vdots \\
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{M,:}}
\end{bmatrix} \in \mathbb{R}^{M \times Q}
\end{align*}
$$
then
$$
\begin{align*}
\frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{p,:}} &=
-\frac{1}{M} \frac{\partial}{\partial \mathbf{\hat{y}}_{p,:}} \left(
    \mathbf{y}_{1,:}^T \log(\mathbf{\hat{y}}_{1,:}) +
    ... +
    \mathbf{y}_{p,:}^T \log(\mathbf{\hat{y}}_{p,:}) +
    ... + 
    \mathbf{y}_{M,:}^T \log(\mathbf{\hat{y}}_{M,:})
\right) \\
&= -\frac{1}{M} \frac{\partial}{\partial \mathbf{\hat{y}}_{p,:}} \left(
    \mathbf{y}_{p,:}^T \log(\mathbf{\hat{y}}_{p,:})
\right) \\
&= -\frac{1}{M} \left(
    \mathbf{y}_{p,:} \oslash \mathbf{\hat{y}}_{p,:}
\right)
\end{align*}
$$
where $\oslash$ is element-wise divide. <br>
Therefore
$$
\begin{align*}
\frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{Y}}} &= -\frac{1}{M} \begin{bmatrix}
    \mathbf{y}_{1,:} \oslash \mathbf{\hat{y}}_{1,:} \\
    \mathbf{y}_{2,:} \oslash \mathbf{\hat{y}}_{2,:} \\
    \vdots \\
    \mathbf{y}_{M,:} \oslash \mathbf{\hat{y}}_{M,:}
\end{bmatrix} \\
&= -\frac{1}{M} \mathbf{Y} \oslash \mathbf{\hat{Y}}
\end{align*}
$$

### pull it all together

$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{Z}} &=
\frac{\partial L}{\partial \sigma}
\frac{\partial \sigma}{\partial \mathbf{Z}} \\
&\in \mathbb{R}^{{\color{Cyan} (M \times Q)} \times ({\color{Cyan} M \times Q} \times {\color{Orange} M \times Q})} \\
&\in \mathbb{R}^{\color{Orange} M \times Q}
\end{align*}
$$

#### $\frac{\partial L}{\partial \mathbf{W}}$
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{W}} &=
\frac{\partial L}{\partial \mathbf{Z}} 
\frac{\partial \mathbf{Z}}{\partial \mathbf{W}} \\
&\in \mathbb{R}^{{\color{Orange} (M \times Q)} \times ({\color{Orange} M \times Q} \times {\color{Magenta} N \times Q})} \\
&\in \mathbb{R}^{\color{Magenta} N \times Q}
\end{align*}
$$

#### $\frac{\partial L}{\partial \mathbf{b}}$
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{b}} &=
\frac{\partial L}{\partial \mathbf{Z}} 
\frac{\partial \mathbf{Z}}{\partial \mathbf{b}} \\
&\in \mathbb{R}^{{\color{Orange} (M \times Q)} \times ({\color{Orange} M \times Q} \times {\color{Magenta} Q})} \\
&\in \mathbb{R}^{\color{Magenta} Q}
\end{align*}
$$

#### Note
We can simply compute:
$$
\frac{\partial L}{\partial \mathbf{W}} = 
\frac{1}{M} \mathbf{X}^T \left(
    \sigma - \mathbf{Y}
\right)
$$
but the objective of this repository is to understand piece by piece and not simply give things for done.

In [10]:
@add_to_class(SoftmaxClassifier)
def update(self, x: torch.Tensor, y_true: torch.Tensor,
           y_pred: torch.Tensor, lr: float) -> None:
    m, n = x.shape
    m, n_classes = y_true.shape

    # cross entropy der
    delta = -(y_true / y_pred) / m

    # softmax der
    soft_der = torch.zeros((m, n_classes, m, n_classes))
    for i in range(m): soft_der[i,:,i,:] = torch.diag(y_pred[i,:]) - torch.outer(y_pred[i,:], y_pred[i,:])
    delta = torch.einsum('pq,pqij->ij', delta, soft_der)

    # weighted sum der
    identity = torch.eye(n_classes)

    ## weight der
    w_der = torch.kron(x.unsqueeze(1).unsqueeze(3), identity.unsqueeze(0).unsqueeze(2))
    w_der = torch.einsum('pq,pqij->ij', delta, w_der)
    self.w -= lr * w_der

    ## bias der
    self.b  -= lr * delta.sum(axis=0)

## Metrics

In [11]:
@add_to_class(SoftmaxClassifier)
def accuracy(self, y_true, y_pred):
    preds = y_pred.argmax(axis=-1)
    compare = (y_true.argmax(axis=-1) == preds).type(torch.float32)
    return compare.mean().item()

## Fit

In [12]:
@add_to_class(SoftmaxClassifier)
def fit(self, x_train: torch.Tensor, y_train: torch.Tensor, 
        epochs: int, lr: float, batch_size: int, 
        x_valid: torch.Tensor, y_valid: torch.Tensor) -> None:
    """fit the model using gradient descent.

    Args:
        x_train: Input tensor of shape (n_samples, num_features).
        y_train: Target tensor one hot of shape (n_samples, n_classes).
        epochs: Number of epochs to train.
        lr: learning rate (0, 1).
        batch_size: Int number of batch.
        x_valid: Input tensor of shape (n_valid_samples, num_features).
        y_valid: Input tensor one hot of shape (n_valid_samples, n_valid_classes).
    """
    for epoch in range(epochs):
        loss = 0
        num_batch = 0
        for batch in range(0, len(y_train), batch_size):
            num_batch += 1
            x_b = x_train[batch:batch+batch_size]
            y_b = y_train[batch:batch+batch_size]

            y_pred = self.predict(x_b)
            loss += self.evaluate(x_b, y_b)

            self.update(x_b, y_b, y_pred, lr)

        loss = round(loss / num_batch, 4)
        loss_v = round(self.evaluate(x_valid, y_valid), 4)
        acc = round(self.accuracy(y_valid, self.predict(x_valid)), 4)
        print(f'epoch: {epoch} - MSE: {loss} - MSE_v: {loss_v} - acc: {acc}')

# Scratch vs TF

## hyperparameters

In [13]:
LR = 0.01
EPOCHS = 16
BATCH = len(X_train) // 3

## models

### TF model

In [14]:
TFModel = tf.keras.Sequential([
    tf.keras.layers.Dense(units=N_CLASS, activation='softmax')
])

TFModel.compile(
    loss = tf.keras.losses.CategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.SGD(learning_rate=LR),
    metrics = [tf.keras.metrics.Accuracy()]
)

TFModel.evaluate(X_train[:1], Y_train[:1])

TFModel.summary()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 558ms/step - accuracy: 0.0000e+00 - loss: 0.4980


### Scratch model

In [15]:
model = SoftmaxClassifier(N, N_CLASS)
model.copy_params(TFModel)

## comparison

In [16]:
def error(tensor_true, tensor_pred) -> float:
    """
     Calculates the percentage error between two tensors or floats.

     If the arguments are simple floats or ints, calculate the percentage error between them.
     If the arguments are Numpy ndarray and PyTorch tensor, calculate the percentage error between them.
     If the argumens are PyTorch tensors, calculate the percentage error between them.

     Args:
         tensor_true: The true tensor or true float.
         pred_tensor: The predicted tensor or the predicted float.

     Returns:
         The percentage error between the tensors or floats.
     """
    if isinstance(tensor_true, (float, int)) and isinstance(tensor_pred, (float, int)):
        return np.abs(tensor_true - tensor_pred) / np.abs(tensor_true) * 100
    elif type(tensor_true) is np.ndarray and type(tensor_pred) is torch.Tensor:
        e = np.abs(tensor_true - tensor_pred.numpy()) / np.abs(tensor_true)
        return np.mean(e) * 100
    e = torch.abs(tensor_true - tensor_pred) / torch.abs(tensor_true)
    return torch.mean(e) * 100

### predict

In [17]:
tf_predict = TFModel.predict(X_train, batch_size=len(X_train))
predict = model.predict(X_train)

error(tf_predict, predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step


1.316886891720421e-14

### CE

In [18]:
tf_predict = TFModel.evaluate(X_train, Y_train, batch_size=len(X_train))[0]
predict = model.evaluate(X_train, Y_train)

error(tf_predict, predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.0000e+00 - loss: 1.6837


0.0

### fit

In [19]:
TFModel.fit(X_train, Y_train, batch_size=BATCH, epochs=EPOCHS,
            shuffle=False, validation_data=(X_valid, Y_valid))

Epoch 1/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.0000e+00 - loss: 1.8048 - val_accuracy: 0.0000e+00 - val_loss: 1.9880
Epoch 2/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.0000e+00 - loss: 1.7202 - val_accuracy: 0.0000e+00 - val_loss: 1.9247
Epoch 3/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.0000e+00 - loss: 1.6423 - val_accuracy: 0.0000e+00 - val_loss: 1.8664
Epoch 4/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.0000e+00 - loss: 1.5718 - val_accuracy: 0.0000e+00 - val_loss: 1.8133
Epoch 5/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0000e+00 - loss: 1.5091 - val_accuracy: 0.0000e+00 - val_loss: 1.7651
Epoch 6/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.0000e+00 - loss: 1.4544 - val_accuracy: 0.0000e+00 - val_loss: 1.721

<keras.src.callbacks.history.History at 0x19439fb3250>

In [20]:
TFModel.history.history['loss']

[1.9439420934033271,
 1.8021994481590227,
 1.6718211303221795,
 1.5545473262674114,
 1.4516629543920239,
 1.3635510372070827,
 1.2895111117048481,
 1.2279633852114837,
 1.1768915756793612,
 1.1342669605696454,
 1.098304917643161,
 1.0675545585920798,
 1.0408871511852702,
 1.0174434438939994,
 0.9965742493713193,
 0.9777884971389093]

In [21]:
model.fit(X_train, Y_train, EPOCHS, LR, BATCH, X_valid, Y_valid)

epoch: 0 - MSE: 1.9439 - MSE_v: 1.988 - acc: 0.2
epoch: 1 - MSE: 1.8022 - MSE_v: 1.9247 - acc: 0.2
epoch: 2 - MSE: 1.6718 - MSE_v: 1.8664 - acc: 0.2
epoch: 3 - MSE: 1.5545 - MSE_v: 1.8133 - acc: 0.2667
epoch: 4 - MSE: 1.4517 - MSE_v: 1.7651 - acc: 0.2667
epoch: 5 - MSE: 1.3636 - MSE_v: 1.7216 - acc: 0.3333
epoch: 6 - MSE: 1.2895 - MSE_v: 1.6824 - acc: 0.3333
epoch: 7 - MSE: 1.228 - MSE_v: 1.6468 - acc: 0.3333
epoch: 8 - MSE: 1.1769 - MSE_v: 1.6145 - acc: 0.3333
epoch: 9 - MSE: 1.1343 - MSE_v: 1.5849 - acc: 0.3333
epoch: 10 - MSE: 1.0983 - MSE_v: 1.5576 - acc: 0.3333
epoch: 11 - MSE: 1.0676 - MSE_v: 1.5322 - acc: 0.3333
epoch: 12 - MSE: 1.0409 - MSE_v: 1.5085 - acc: 0.3333
epoch: 13 - MSE: 1.0174 - MSE_v: 1.4863 - acc: 0.3333
epoch: 14 - MSE: 0.9966 - MSE_v: 1.4653 - acc: 0.3333
epoch: 15 - MSE: 0.9778 - MSE_v: 1.4454 - acc: 0.2667


In [22]:
tf_predict = TFModel.weights[0].numpy()
predict = model.w

error(tf_predict, predict)

1.641674717706504e-14

In [23]:
tf_predict = TFModel.weights[1].numpy()
predict = model.b

error(tf_predict, predict)

1.5868532473458515e-14

# Full train

In [24]:
model2 = SoftmaxClassifier(N, N_CLASS)

print(model2.b)
print(model2.w)

tensor([-0.4401, -1.4111, -0.8354])
tensor([[ 1.0526e+00, -5.9961e-01,  2.8065e-02],
        [-2.3228e-01,  9.5730e-01,  4.8130e-01],
        [-1.5735e+00, -7.7481e-01,  2.9745e-01],
        [-1.4995e-02,  6.8289e-01,  2.2599e-04],
        [-9.1810e-01, -1.0410e+00,  1.2621e+00]])


In [25]:
model2.evaluate(X_valid, Y_valid)

1.9475972815460652

In [26]:
model2.fit(X_train, Y_train, 350, 0.001, 1, X_valid, Y_valid)

epoch: 0 - MSE: 2.1194 - MSE_v: 1.8911 - acc: 0.4667
epoch: 1 - MSE: 2.0286 - MSE_v: 1.8378 - acc: 0.4667
epoch: 2 - MSE: 1.9436 - MSE_v: 1.7874 - acc: 0.4667
epoch: 3 - MSE: 1.8641 - MSE_v: 1.7397 - acc: 0.4667
epoch: 4 - MSE: 1.7899 - MSE_v: 1.6945 - acc: 0.4667
epoch: 5 - MSE: 1.7207 - MSE_v: 1.6515 - acc: 0.4667
epoch: 6 - MSE: 1.6563 - MSE_v: 1.6105 - acc: 0.4667
epoch: 7 - MSE: 1.5962 - MSE_v: 1.5714 - acc: 0.4667
epoch: 8 - MSE: 1.5403 - MSE_v: 1.5341 - acc: 0.4667
epoch: 9 - MSE: 1.4881 - MSE_v: 1.4984 - acc: 0.4667
epoch: 10 - MSE: 1.4395 - MSE_v: 1.4643 - acc: 0.4667
epoch: 11 - MSE: 1.3941 - MSE_v: 1.4317 - acc: 0.4667
epoch: 12 - MSE: 1.3518 - MSE_v: 1.4005 - acc: 0.4667
epoch: 13 - MSE: 1.3122 - MSE_v: 1.3706 - acc: 0.4667
epoch: 14 - MSE: 1.2753 - MSE_v: 1.3421 - acc: 0.5333
epoch: 15 - MSE: 1.2409 - MSE_v: 1.3148 - acc: 0.5333
epoch: 16 - MSE: 1.2087 - MSE_v: 1.2886 - acc: 0.5333
epoch: 17 - MSE: 1.1786 - MSE_v: 1.2636 - acc: 0.5333
epoch: 18 - MSE: 1.1504 - MSE_v: 1.239

In [27]:
pred = model2.predict(X_valid)
pred.argmax(axis=-1)

tensor([0, 0, 0, 0, 0, 0, 2, 0, 2, 1, 1, 1, 2, 0, 2])

In [28]:
Y_valid.argmax(axis=-1)

tensor([0, 0, 0, 0, 2, 0, 2, 0, 2, 1, 1, 1, 2, 0, 0])

In [29]:
model2.accuracy(Y_valid, pred)

0.8666666746139526