In [4]:
import numpy as np

In [None]:
np.random.seed(11)
C, N = 5, 500 # number of classes and number of points per class
means = [[2, 2], [8, 3], [3, 6], [14, 2], [12, 8]]
cov = [[1, 0], [0, 1]]
X0 = np.random.multivariate_normal(means[0], cov, N)
X1 = np.random.multivariate_normal(means[1], cov, N)
X2 = np.random.multivariate_normal(means[2], cov, N)
X3 = np.random.multivariate_normal(means[3], cov, N)
X4 = np.random.multivariate_normal(means[4], cov, N)
X = np.concatenate((X0, X1, X2, X3, X4), axis = 0) # each row is a datapoint
Xbar = np.concatenate((X, np.ones((X.shape[0], 1))), axis = 1) # bias trick
y = np.asarray([0]*N + [1]*N + [2]*N+ [3]*N + [4]*N) # label
# y shape of (2500,)
# Xbar shape of (2500, 3)
# X shape of (2500, 2)
W_init = np.random.randn(Xbar.shape[1], C)
# W_init shape of (3, 5)
"""
N_total = 2500, C = 5, d = 3
50
"""
# print(W_init.shape)
# print(W_init.shape)

In [18]:
def softmax_stable(Z):
    Z_exp = np.exp(Z - np.max(Z, axis = 0, keepdims=True))
    partition = np.sum(Z_exp, axis = 0, keepdims=True)
    return Z_exp / partition

In [19]:
def softmax_loss(X, y, W):
    """
    W: 2d numpy array of shape (d, C) each column corresponding to one output code.
    X: 2d numpy array of shape (N, d), each row is one datapoint
    y: 1d numpy array -- label of each row of X
    """
    A = softmax_stable(X.dot(W)) # shape of (N, C)
    return -np.mean(A[range(X.shape[0]), y])

- Khi biểu diễn dưới dạng toán học, mỗi điểm dữ liệu là một cột của ma trận X; nhưng khi làm việc với numpy, mỗi điểm dữ liệu được dọc theo **axis = 0** của mảng 2 chiều X. Việc này thống nhất với thư viện scikit-learn hay tensorflow ở việc X[i] được dùng để chỉ điểm dữ liệu thứ i, idexing từ 0. Tức là, nếu có N điểm dữ liệu trong không gian d chiều thì $X \in R^{d \times N}$ , nhưng **X.shape == (N, d)**.
- $W \in R^{d \times C}$ , **W.shape == (d, C)**.
- $W^TX$ sẽ được biểu diễn bởi **X.dot(W)** , và có **shape == (N, C)**.

In [29]:
def softmax_grad(X, y, W): # shape of (C, C)
    """
    W: 2d numpy array of shape (d, C), each column corresponding to one output node
    X: 2d numpy array of shape (N, d), each row is one data point
    y: 1d numpy array -- label of  each row of X
    """
    A = softmax_stable(X.dot(W)) # shape of (N, C)
    A[range(X.shape[0]), y] -= 1
    return X.T.dot(A)/X.shape[0]

In [21]:
mix_ids = np.random.permutation(N)
print(mix_ids)

[255  72 159 370 265 269 178  42 369 295 448 211 304 421 213 277 332  66
 253  21  24 432  22 388 392 201 228 330  88  29 323 291 157 499 347 446
 186 480 346 149 314   0  82 349 394 251 387  89 439 494 267 131 194 364
 252  53 490 147 240 170 399 107 464 216  95 272 105 266 325 215 373 275
 100 218 256  48 345 334 391 234 484  75 139 317 206 204  96 161 137 179
 456 127 196 283 335 285 344 125 109 496  56 130 279  60 142 222 313 319
 333 471 128 183 104 377 414  37 158 305 189 181 182  31 365 120 356 132
 470 350 156 450 482 254  19 433  91 324 418 312 241 180 242 164 208 362
 217 133 419 145 273 492  33  52 126 188  39 425  43  83 166 355 353 220
  69 416 372  13 271 383 306 118  12 374 410 398   3 486 340 260 270 449
  67 354  77   9   2 165 292 169  64 257 412  98  62 167 192   7 307 227
 452 469 402 341 316 226 487 474 390  41 326  51 303 485  23  97 258  26
 136 406 363 367 415  57 327 462 329 328 310 401 247 404 495 437 233 290
 225 360 173 473  28  40 435 212 311  17  93 455 45

In [23]:
def softmax_fit(X, y, W, lr = 0.01, nepochs = 100, dif = 1e-5, batch_size = 10):
    W_old = W.copy()
    ep = 0
    loss_hist = [softmax_loss(X, y, W)]
    N = X.shape[0]
    nbatches = int(np.ceil(float(N)/batch_size))
    while ep < nepochs:
        ep += 1
        mix_idxs = np.random.permutation(N) # stochastic
        for i in range(nbatches):
            batch_idxs = mix_idxs[batch_size*i : min(batch_size*(i+1), N)]
            X_batch, y_batch = X[batch_idxs], y[batch_idxs]
            W -= lr * softmax_grad(X_batch, y_batch, W)
            loss_hist.append(softmax_loss(X, y, W))
            if np.linalg.norm(W - W_old) / W.size < dif:
                break
            W_old = W.copy()
    return W, loss_hist

In [24]:
def predict(W, X):
    return np.argmax(X.dot(W), axis = 1)

In [30]:
W, loss_hist = softmax_fit(Xbar, y, W_init, lr = 0.05)
print(W)
print(loss_hist)

[[-2.29909127e-01  2.73033531e+02 -1.24337610e+02  1.70359311e+03
   1.39548814e+03]
 [ 1.63352185e-02  8.86828896e+01  5.22336996e+02  1.01781780e+02
   1.06992178e+03]
 [ 1.25092408e+02  1.27102251e+02  1.24799422e+02  1.25712911e+02
   1.25622280e+02]]
[-0.0002824436138549672, -0.0002562622909304972, -0.0002624733008551325, -0.0002491915254795699, -0.0002551863960762068, -0.00024970832508779107, -0.0002554662762563103, -0.0002829078699379755, -0.00028851471286928636, -0.0002702420833781945, -0.00026663136090232764, -0.0002625116873270845, -0.00027845203986719456, -0.00026361851588611526, -0.00023353744032134064, -0.00022360610663991687, -0.00025106178464434837, -0.0002677229514923169, -0.00028551495957168936, -0.0003177114306444029, -0.0003881130277601653, -0.0004475663066236253, -0.00046111948755896945, -0.00047310643032515945, -0.0005160735656382118, -0.0005740824166220799, -0.0006636788686384669, -0.000687031206748681, -0.0007184139856735366, -0.0007771669321515204, -0.0008309529