In [9]:
import numpy as np

# Small 3x4 matrix used to contrast two kinds of integer-array indexing.
A = np.arange(12).reshape(3, 4)
# A =
# [[ 0, 1, 2, 3],
#  [ 4, 5, 6, 7],
#  [ 8, 9,10,11]]
y = np.array([1, 3, 0])  # one target column index per row of A

# Slice on the row axis + index array on the column axis: the same columns
# [1, 3, 0] are taken from EVERY row, so the result has shape (3, 3):
# [[ 1, 3, 0],
#  [ 5, 7, 4],
#  [ 9,11, 8]]
print(A[:, y])

# Index arrays on both axes are paired element-wise, i.e. (row i, column y[i]),
# so the result has shape (3,): [1, 7, 8]
print(A[np.arange(len(y)), y])


[[ 1  3  0]
 [ 5  7  4]
 [ 9 11  8]]
[1 7 8]


In [None]:
import numpy as np


def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)) for real-valued input.

    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``. Here only ``exp(-|x|)`` is
    evaluated, whose argument is never positive, so it cannot overflow.

    Args:
        x: Real scalar or array-like.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer / bool input: promote so the exponential is computed in float.
        x = x.astype(float)

    e = np.exp(-np.abs(x))  # always in (0, 1]
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- same value.
    out = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # arrays of higher rank unchanged.
    return out[()]


def softmax(x, axis=1):
    """Numerically stable softmax along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating; this does
    not change the result but keeps ``exp`` from overflowing.

    Args:
        x: Array-like of scores.
        axis: Axis that is normalised to sum to 1. Defaults to 1 (one
            distribution per row of a 2-D batch); pass ``axis=0`` or
            ``axis=-1`` for 1-D input.

    Returns:
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    expx = np.exp(shifted)
    return expx / np.sum(expx, axis=axis, keepdims=True)


class Linear:
    """Two-layer fully connected classifier: linear -> ReLU -> linear -> softmax.

    Despite its name the model has one hidden ReLU layer. It is trained with
    full-batch gradient descent on the mean cross-entropy loss.

    Attributes:
        W1: Weights of the first layer, shape (in_size, hidden_size).
        b1: Bias of the first layer, shape (hidden_size,).
        W2: Weights of the second layer, shape (hidden_size, out_size).
        b2: Bias of the second layer, shape (out_size,).
    """

    def __init__(self, in_size, hidden_size, out_size, seed=None):
        """Initialise weights from a standard normal and biases to zero.

        Args:
            in_size: Number of input features.
            hidden_size: Number of hidden units.
            out_size: Number of classes.
            seed: Optional seed for reproducible weights. When ``None`` the
                weights are drawn from NumPy's global RNG (``np.random.randn``),
                so ``np.random.seed`` still controls them.
        """
        self.in_size = in_size
        self.hidden_size = hidden_size
        self.out_size = out_size

        if seed is None:
            self.W1 = np.random.randn(in_size, hidden_size)
            self.W2 = np.random.randn(hidden_size, out_size)
        else:
            rng = np.random.default_rng(seed)
            self.W1 = rng.standard_normal((in_size, hidden_size))
            self.W2 = rng.standard_normal((hidden_size, out_size))
        self.b1 = np.zeros((hidden_size,))
        self.b2 = np.zeros((out_size,))

    def _hidden(self, X):
        """Return the hidden pre-activation ``h`` and its ReLU ``a`` for batch X."""
        h = X @ self.W1 + self.b1
        a = np.where(h > 0, h, 0)  # ReLU
        return h, a

    def forward(self, X):
        """Return class probabilities of shape (B, out_size) for inputs X of shape (B, in_size)."""
        _, a1 = self._hidden(X)
        logits = a1 @ self.W2 + self.b2
        y_pred = softmax(logits)
        return y_pred

    def fit(self, X, y, lr=0.1, num_epochs=1000):
        """Train in place with full-batch gradient descent.

        Prints the loss every 100 epochs.

        Args:
            X: Inputs of shape (B, in_size).
            y: Integer class indices used to pick one column per row
                (``[np.arange(B), y]``), typically shape (B,).
            lr: Learning rate.
            num_epochs: Number of full passes over ``X``.

        Raises:
            ValueError: If ``X`` contains no samples.
        """
        X = np.asarray(X)
        B, _ = X.shape
        if B == 0:
            raise ValueError("fit() requires at least one sample")
        rows = np.arange(B)

        for epoch in range(num_epochs):
            # forward pass
            h, a = self._hidden(X)  # both (B, hidden_size)
            z = a @ self.W2 + self.b2  # (B, out)

            # log-softmax via the log-sum-exp trick: log p = z - logsumexp(z).
            # Taking log(softmax(z)) instead yields -inf as soon as a
            # probability underflows to 0.
            shifted = z - np.max(z, axis=1, keepdims=True)
            log_probs = shifted - np.log(
                np.sum(np.exp(shifted), axis=1, keepdims=True)
            )

            # compute loss: mean cross entropy against the integer labels
            loss = -np.mean(log_probs[rows, y])

            # backward pass
            dz = np.exp(log_probs)  # softmax probabilities, fresh array
            dz[rows, y] -= 1  # p - onehot(y), (B, out)
            dz /= B  # gradient of the *mean* loss

            dh = (dz @ self.W2.T) * (h > 0)  # back through ReLU, (B, hidden_size)
            dW2 = a.T @ dz  # (hidden_size, out)
            db2 = np.sum(dz, axis=0)  # (out, )
            dW1 = X.T @ dh  # (input_size, hidden_size)
            db1 = np.sum(dh, axis=0)  # (hidden_size, )

            # optimizer.step
            self.W1 -= lr * dW1
            self.b1 -= lr * db1
            self.W2 -= lr * dW2
            self.b2 -= lr * db2

            # logging
            if epoch % 100 == 0:
                loss = float(loss)
                print(f"[{epoch}] {loss=}")

    def predict(self, X):
        """Return the most probable class index for each row of X, shape (B,)."""
        probs = self.forward(X)
        return np.argmax(probs, axis=1)

In [11]:
def make_blobs(n_per_class=200, centers=None, std=0.6, seed=7):
    """Sample isotropic Gaussian clusters and return them in shuffled order.

    Args:
        n_per_class: Number of points drawn for each cluster.
        centers: Sequence of cluster means, one per class. Defaults to three
            2-D centres: (0, 0), (3, 3) and (-3, 3).
        std: Standard deviation used for every coordinate of every cluster.
        seed: Seed for ``np.random.default_rng``.

    Returns:
        ``(X, y)``: the stacked points and their integer class labels, both
        permuted with the same random order.
    """
    rng = np.random.default_rng(seed)
    if centers is None:
        centers = np.array([[0, 0], [3, 3], [-3, 3]], dtype=float)

    # Draw the clusters in class order so the generator is consumed in a
    # fixed sequence.
    clusters = [
        rng.normal(loc=center, scale=std, size=(n_per_class, len(center)))
        for center in centers
    ]
    labels = [
        np.full(n_per_class, class_id, dtype=int)
        for class_id in range(len(clusters))
    ]

    points = np.vstack(clusters)
    targets = np.concatenate(labels)

    # shuffle points and labels with one shared permutation
    order = rng.permutation(len(targets))
    return points[order], targets[order]

# ----- Train/test split -----
# 300 points per class; the data comes back already shuffled, so the first
# 80% of the rows can be used for training and the rest for testing.
X, y = make_blobs(std=0.7, n_per_class=300)
n = len(X)
split = int(0.8 * n)
X_train, X_test = np.split(X, [split])
y_train, y_test = np.split(y, [split])

print(y_train.shape)

(720,)


In [12]:

# ----- Train -----
# Linear draws its initial weights from NumPy's global RNG; seed it so that
# Restart & Run All reproduces the same training run.
np.random.seed(0)
model = Linear(in_size=2, hidden_size=16, out_size=3)
model.fit(X_train, y_train, lr=0.1, num_epochs=2000)

# ----- Evaluate -----
# Accuracy = fraction of held-out samples whose predicted class matches the label.
pred = model.predict(X_test)
acc = (pred == y_test).mean()
print(f"Test accuracy: {acc*100:.2f}%  ({pred.size} samples)")

[0] loss=4.901427725148519
[100] loss=0.028145309817458546
[200] loss=0.014806726227682636
[300] loss=0.009798650021765463
[400] loss=0.007541345095430231
[500] loss=0.006238628255491287
[600] loss=0.005357961550273926
[700] loss=0.004718240849586416
[800] loss=0.004233307325909816
[900] loss=0.003852586202244709
[1000] loss=0.0035425013323305537
[1100] loss=0.003283918762146743
[1200] loss=0.0030647214804978068
[1300] loss=0.002876166456884398
[1400] loss=0.002712076377694536
[1500] loss=0.0025677703415072067
[1600] loss=0.0024392267816844776
[1700] loss=0.0023243318311906816
[1800] loss=0.00222097767594377
[1900] loss=0.0021275143077050043
Test accuracy: 100.00%  (180 samples)
