# Нейросеть на numpy

todo:

- добавить проверку на правильность dx в softmax_loss

### Линейный слой (forward, backward)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from source.helpme import eval_error, numerical_grad_array

In [2]:
from sklearn.datasets import load_digits

In [3]:
X, Y = load_digits(return_X_y=True)
X_train = X[:1500]
y_train = Y[:1500]

X_test = X[1500:]
y_test = Y[1500:]

print(X_train.shape, X_test.shape)

(1500, 64) (297, 64)


In [4]:
x = np.random.rand(16, 8*8*3)
w = np.random.rand(8*8*3, 64)
b = np.random.rand(64)

In [5]:
def linear_forward(x, w, b):
    out = x.dot(w) + b
    cache = {'x':x.copy(), 'w':w.copy(), 'b':b.copy()}
    return out, cache

def linear_backward(dout, cache):
    x = cache['x']
    w = cache['w']
    b = cache['b']
    
    dx = dout.dot(w.T)
    dw = x.T.dot(dout) # 192,16 x 16,64
    db = dout.sum(0)
    
    return (dx, dw, db)

In [6]:
out, cache = linear_forward(x, w, b)

Проверка: представим, что мы как-то посчитали ошибку и во время обратного распространения ошибки пришел тензор dout

In [7]:
# заполним dout случайными числами
dout = np.random.rand(16, 64)

# вычислим значения градиентов при таком dout (аналитически)
dx, dw, db = linear_backward(dout, cache) 

# вычислим численным методом значения градиентов
dx_ = numerical_grad_array(lambda x: linear_forward(x, w, b)[0], x, dout)
dw_ = numerical_grad_array(lambda w: linear_forward(x, w, b)[0], w, dout)
db_ = numerical_grad_array(lambda b: linear_forward(x, w, b)[0], b, dout)

# посмотрим на ошибки вычислений (значения должны быть меньше 1е-8)
print('dx error:', eval_error(dx, dx_))
print('dw error:', eval_error(dw, dw_))
print('db error:', eval_error(db, db_))

dx error: 1.2898659978448889e-11
dw error: 4.965990972815423e-10
db error: 4.658391882452255e-11


### ReLU

In [8]:
def relu_forward(x):
    x[x < 0] = 0
    cache = {'x' : x.copy()}
    return x, cache

def relu_backward(dout, cache):
    x = cache['x']
    dx = dout
    dx[x < 0] = 0
    return dx

### Softmax loss

-log(e^s_i / sum(e^s_j)) = log(sum(e^s_j)) - e^s_i

In [9]:
def softmax_loss(pred, true):
    pred = pred - np.max(pred, 1, keepdims=True)
    pred_exp = np.exp(pred)
    N = len(pred)
    
    S = np.sum(pred_exp, 1, keepdims=True)
    CE = pred - np.log(S)
    
    loss = -np.sum(CE[np.arange(N), true]) / N
    
    dx = np.exp(CE)
    dx[np.arange(N), true] -= 1
    dx /= N
    
    return loss, dx

In [10]:
true = np.random.randint(0, 10, 16)
pred = np.abs(np.random.randn(16, 10) * 5)

loss, dx = softmax_loss(pred, true)
print(loss)

7.115521616368678


### TwoLayerNet

In [11]:
class TwoLayerNet():
    def __init__(self, 
                 input_size : int, 
                 hidden_size : int, 
                 output_size : int, 
                 weight_scale : float = 1e-3):
        
        self.w1 = np.random.randn(input_size, hidden_size) * weight_scale
        self.b1 = np.zeros(hidden_size)
        
        self.w2 = np.random.randn(hidden_size, output_size) * weight_scale
        self.b2 = np.zeros(output_size)
        
    
    def train(self, x, y, batch_size=256, epochs=30, lr=5e-3):
        N = len(x)
        for e in range(1, epochs):
            epoch_loss = []
            for idx in range(0, N, batch_size):
                # pick batch of data
                x_batch = x[idx : min(idx+batch_size, N)]
                y_batch = y[idx : min(idx+batch_size, N)]
                
                # forward pass
                out1, cache1 = linear_forward(x_batch, self.w1, self.b1)
                out2, cache2 = relu_forward(out1)
                pred, cache3 = linear_forward(out2, self.w2, self.b2)
                loss, dx = softmax_loss(pred, y_batch)
                epoch_loss.append(loss)

                # backward pass
                dx, dw2, db2 = linear_backward(dx, cache3)
                dx = relu_backward(dx, cache2)
                dx, dw1, db1 = linear_backward(dx, cache1)

                # update weights
                self.w1 -= dw1 * lr
                self.b1 -= db1 * lr
                self.w2 -= dw2 * lr
                self.b2 -= db2 * lr
            
            print(f'epoch {e} | CEloss = {np.mean(epoch_loss)}')
            
    
    def predict(self, x, batch_size=256):
        N = len(x)
        pred = []
        for idx in range(0, N, batch_size):
            x_batch = x[idx : min(idx+batch_size, N)]
            
            out, _ = linear_forward(x_batch, self.w1, self.b1)
            out, _ = relu_forward(out)
            out, _ = linear_forward(out, self.w2, self.b2)
            
            pred.append(np.argmax(out, 1))
        return np.concatenate(pred)

In [12]:
nn = TwoLayerNet(64, 128, 10)

In [13]:
nn.train(X_train, y_train, epochs=500)

epoch 1 | CEloss = 2.302484112300393
epoch 2 | CEloss = 2.302278108731002
epoch 3 | CEloss = 2.302065164226423
epoch 4 | CEloss = 2.301840334014587
epoch 5 | CEloss = 2.3015981352004125
epoch 6 | CEloss = 2.3013327998310076
epoch 7 | CEloss = 2.3010389034408707
epoch 8 | CEloss = 2.3007098924639475
epoch 9 | CEloss = 2.30033849905476
epoch 10 | CEloss = 2.299916574373626
epoch 11 | CEloss = 2.299435751751154
epoch 12 | CEloss = 2.2988849458548724
epoch 13 | CEloss = 2.2982517228175614
epoch 14 | CEloss = 2.2975220223194626
epoch 15 | CEloss = 2.2966791110929656
epoch 16 | CEloss = 2.295704056569076
epoch 17 | CEloss = 2.2945743398477254
epoch 18 | CEloss = 2.2932636387148153
epoch 19 | CEloss = 2.2917403796606375
epoch 20 | CEloss = 2.2899689265895637
epoch 21 | CEloss = 2.2879078787798837
epoch 22 | CEloss = 2.28550837827912
epoch 23 | CEloss = 2.282714738788231
epoch 24 | CEloss = 2.2794621499313625
epoch 25 | CEloss = 2.2756760421791777
epoch 26 | CEloss = 2.2712712951939635
epoch 2

epoch 209 | CEloss = 0.11599810958984504
epoch 210 | CEloss = 0.11525599745642912
epoch 211 | CEloss = 0.11452491823957224
epoch 212 | CEloss = 0.11380494260407636
epoch 213 | CEloss = 0.11309522845137622
epoch 214 | CEloss = 0.11239422808442057
epoch 215 | CEloss = 0.11170160432200786
epoch 216 | CEloss = 0.1110182914972015
epoch 217 | CEloss = 0.11034457081311526
epoch 218 | CEloss = 0.10968011713609577
epoch 219 | CEloss = 0.10902460833387147
epoch 220 | CEloss = 0.10837774638401554
epoch 221 | CEloss = 0.10773932503495028
epoch 222 | CEloss = 0.10710933910543648
epoch 223 | CEloss = 0.10648657982513315
epoch 224 | CEloss = 0.10587174743596557
epoch 225 | CEloss = 0.10526506157536636
epoch 226 | CEloss = 0.10466588985372993
epoch 227 | CEloss = 0.10407447059329782
epoch 228 | CEloss = 0.10349012154816546
epoch 229 | CEloss = 0.10291304802525847
epoch 230 | CEloss = 0.10234311488774878
epoch 231 | CEloss = 0.10178017677446832
epoch 232 | CEloss = 0.10122416939940389
epoch 233 | CElos

epoch 420 | CEloss = 0.05187770880997237
epoch 421 | CEloss = 0.051745825791984036
epoch 422 | CEloss = 0.051614574524703194
epoch 423 | CEloss = 0.05148395462157476
epoch 424 | CEloss = 0.05135402286191992
epoch 425 | CEloss = 0.05122472005128679
epoch 426 | CEloss = 0.05109604205680568
epoch 427 | CEloss = 0.05096798597033323
epoch 428 | CEloss = 0.05084060171923699
epoch 429 | CEloss = 0.05071380883566074
epoch 430 | CEloss = 0.050587614339277796
epoch 431 | CEloss = 0.050462007857992534
epoch 432 | CEloss = 0.050336983564443556
epoch 433 | CEloss = 0.05021252083001631
epoch 434 | CEloss = 0.05008858954957174
epoch 435 | CEloss = 0.04996522642547025
epoch 436 | CEloss = 0.04984244164439113
epoch 437 | CEloss = 0.049720190935817345
epoch 438 | CEloss = 0.04959852980926558
epoch 439 | CEloss = 0.0494774320902611
epoch 440 | CEloss = 0.04935688701704527
epoch 441 | CEloss = 0.04923689078040692
epoch 442 | CEloss = 0.049117438298543405
epoch 443 | CEloss = 0.048998516120113335
epoch 444

In [14]:
y_pred = nn.predict(X_test)

In [15]:
(y_pred == y_test).mean()

0.9259259259259259