In [25]:
import numpy as np
np.random.seed(686)
import tqdm

In [26]:
# ================================
# ===== Data Load and Prep =======
# ================================

""" prep binary classification data for mnist 4s vs 9s"""

from keras.datasets import mnist

# Load MNIST through Keras; load_data() returns
# ((x_train, y_train), (x_test, y_test)).
dataset = mnist.load_data()

In [27]:
# The two digit classes to tell apart; model probabilities refer to DIG_B.
DIG_A, DIG_B = 4, 9
# Images are SIDE x SIDE pixels.
SIDE = 28
# Largest raw pixel intensity; used to rescale pixels to [0, 1].
MAX_PIX_VAL = 255
# Number of training examples kept after shuffling.
NB_TRAIN = 1000

In [28]:
# Reuse the dataset already loaded into `dataset` instead of reading and
# parsing MNIST from disk a second time.
(x_train, y_train), (x_test, y_test) = dataset

# x_train is N_ x SIDE x SIDE array of integers in [0, 255]
# y_train is N_ array of integer labels in [0, 9]
# where N_ = 60000

In [29]:
# ===== keep only two digits of interest ==================
def filter_digits(xs,ys):
    """Keep only the examples whose label equals DIG_A or DIG_B.

    Both xs and ys are indexed with the same boolean mask built from ys,
    so images and labels stay aligned. Returns the filtered (xs, ys) pair.
    """
    is_dig_a = np.equal(ys, DIG_A)
    is_dig_b = np.equal(ys, DIG_B)
    keep = np.logical_or(is_dig_a, is_dig_b)
    return xs[keep], ys[keep]

# Restrict both the training and the test split to the two digits of interest.
x_train, y_train = filter_digits(x_train, y_train)
x_test, y_test = filter_digits(x_test, y_test)

# x_train is N x SIDE x SIDE array of integers in [0, 255]
# y_train is N array of integer labels in {4, 9}
# where N ~ 12000

In [30]:
# ===== shape ==========================================
# Images and labels must stay aligned; N is the current number of training examples.
assert len(x_train) == len(y_train)
N = len(x_train)
# Bare expression so the notebook displays N as the cell output.
N

11791

In [31]:
# ======= normalize pixel intensities ======================
# Rescale pixel intensities by MAX_PIX_VAL.
# NOTE: this rebinds x_train / x_test, so re-running this cell rescales a second time.
x_train = x_train / float(MAX_PIX_VAL)
x_test = x_test / float(MAX_PIX_VAL)

# x_train is N x SIDE x SIDE array of floats in [0., 1.]
# y_train is N array of integer labels in {4, 9}
# where N ~ 12000

In [32]:
# ======= shuffle data ======================================
def shuffle(xs, ys):
    """Return xs and ys reordered by one shared random permutation.

    The permutation is drawn from numpy's global RNG, so the result depends
    on the current np.random state. The inputs themselves are not modified;
    new arrays are produced by fancy indexing.
    """
    order = np.arange(len(xs))
    np.random.shuffle(order)  # shuffles the index array in place
    return xs[order], ys[order]

# Shuffle each split; images and labels are permuted together.
x_train, y_train = shuffle(x_train, y_train)
x_test, y_test = shuffle(x_test, y_test)


# ======= pare down training set ============================
# Keep only the first NB_TRAIN shuffled training examples (the test set is untouched).
x_train = x_train[:NB_TRAIN]
y_train = y_train[:NB_TRAIN]

# ======= add noise to pixel intensities ===============================
# Standard-normal noise is added to the training images only, then values are
# clamped back into [0, 1]. The test images are left noise-free.
x_train = x_train + np.random.randn(*x_train.shape)
x_train = np.maximum(0., np.minimum(1., x_train))

# ===== shape ==========================================
# Recompute N now that the training set has been pared down.
assert len(x_train) == len(y_train)
N = len(x_train)


In [33]:
# ==== Measure proximity
def close_enough(a, b, tol=1e-6):
    """Return True when a and b differ by less than tol in Euclidean norm.

    Works for plain Python numbers, numpy scalars and arrays of any rank:
    both arguments are converted with np.asarray before subtracting, so
    inputs without a .flatten() method (e.g. two Python floats) are accepted.

    Parameters:
        a, b: numbers or array-likes with broadcast-compatible shapes.
        tol:  strict upper bound on the norm of (b - a); defaults to 1e-6.

    Returns:
        A boolean (numpy bool) that is True iff ||b - a|| < tol.
    """
    diff = np.asarray(b) - np.asarray(a)
    # ravel() turns scalars (0-d) and higher-rank arrays alike into a vector
    return np.linalg.norm(diff.ravel()) < tol

In [34]:
# ======= Sanity Checks =====================================
# Shapes: one SIDE x SIDE image and one label per training example.
assert x_train.shape == (N, SIDE, SIDE)
assert y_train.shape == (N,)
# Only the two digits of interest remain, and both are present.
assert set(y_train) == {DIG_A, DIG_B}
# After noising and clamping, pixel values span exactly [0, 1].
assert close_enough(np.min(x_train), 0.)
assert close_enough(np.max(x_train), 1.)
# N should be close to the requested training-set size.
assert abs(N-min(NB_TRAIN, 12000)) < 500

print("hooray!")
print("prepped {} training examples".format(N))

hooray!
prepped 1000 training examples


In [35]:
# ====== Metric for Success =========================================

""" acc, loss 

    prob p represent model's prob mass for DIG_B
    by 'predictor' I mean a function that takes in an image and gives a prob
"""

def accuracy(predicted_labels, true_ys):
    """Fraction of positions at which the predicted label equals the true label.

    The two sequences are paired with zip, so any surplus entries in the
    longer one are ignored.
    """
    hits = []
    for label, truth in zip(predicted_labels, true_ys):
        hits.append(1. if label == truth else 0.)
    return np.mean(hits)
    
def cross_entropy_loss(predicted_probs, true_ys):
    """Mean negative log-probability that the model assigned to the true label.

    Each entry of predicted_probs is the probability given to DIG_B; for an
    example whose label is not DIG_B the complementary probability 1 - p is
    scored instead.
    """
    losses = []
    for p, y in zip(predicted_probs, true_ys):
        prob_of_truth = p if y == DIG_B else 1. - p
        losses.append(-np.log(prob_of_truth))
    return np.mean(losses)

def judge(predictor, xs, ys, verbose=False):
    """Score a predictor on examples xs with true labels ys.

    predictor maps one image to a probability for DIG_B. The predicted label
    is DIG_B when that probability exceeds 0.5 and DIG_A otherwise. When
    verbose is true the examples are wrapped in a tqdm progress bar.

    Returns a dict with keys "acc" (accuracy) and "loss" (cross-entropy).
    """
    if verbose:
        xs = tqdm.tqdm(xs)

    probs = []
    labels = []
    for x in xs:
        p = predictor(x)
        probs.append(p)
        labels.append(DIG_B if p > 0.5 else DIG_A)

    acc = accuracy(labels, ys)
    loss = cross_entropy_loss(probs, ys)
    return {"acc": acc, "loss": loss}

In [36]:
# ======= sanity checks using placeholder =========================================
# Constant predictors: each ignores the image and returns a fixed probability for DIG_B.
very_sure_A = lambda x : .01
very_sure_B = lambda x : .99
maybe_its_A = lambda x : .4
maybe_its_B = lambda x : .6
fifty_fifty = lambda x : .5

# Always answering DIG_A and always answering DIG_B are right on complementary
# sets of examples, so the two accuracies must add up to 1.
vsa = judge(very_sure_A, x_train, y_train)["acc"]
vsb = judge(very_sure_B, x_train, y_train)["acc"]
assert close_enough(vsa + vsb, 1.)

# Same predictor (sure of DIG_A) on one example, scored against each possible label.
vsa = judge(very_sure_A, x_train[:1], [DIG_A])["acc"]
vsb = judge(very_sure_A, x_train[:1], [DIG_B])["acc"]
assert close_enough(vsa, 1.)
assert close_enough(vsb, 0.)


# Losses: among these constant predictors the 50/50 one scores lowest, and the
# more confident constant guess scores worse than the milder one.
vsa = judge(very_sure_A, x_train, y_train)["loss"]
vsb = judge(very_sure_B, x_train, y_train)["loss"]
mia = judge(maybe_its_A, x_train, y_train)["loss"]
mib = judge(maybe_its_B, x_train, y_train)["loss"]
ffl = judge(fifty_fifty, x_train, y_train)["loss"]
assert ffl < mia < vsa
assert ffl < mib < vsb
# A 50/50 guess costs exactly -log(1/2) = log(2) per example.
assert close_enough(ffl, np.log(2))

print("hooray!")

0.6931471805599454 0.7184237591173707 0.7086925965227748
hooray!


In [37]:
# ===========================================================================
# ====== LINEAR MODEL =======================================================
# ===========================================================================

In [38]:
# ===========================================================================    
# ====== Manipulate Weights ======================================================
def linear_init():
    """Draw a random weight vector with one entry per pixel.

    Entries are standard-normal samples from numpy's global RNG, divided by
    sqrt(SIDE*SIDE).
    """
    n_pixels = SIDE*SIDE
    return np.random.randn(n_pixels) / np.sqrt(n_pixels)

def linear_displace(w, coef, g):
    """Return w displaced by coef times g, i.e. w + coef * g."""
    displacement = coef * g
    return w + displacement

# ===========================================================================    
# ====== Forward Model ======================================================

def clip(z):
    """Clamp z elementwise to the interval [-15, +15]."""
    return np.maximum(-15., np.minimum(+15., z))

def sigmoid(z):
    """Logistic function 1/(1+exp(-z)) applied to the clamped input.

    Because z is first clamped to [-15, +15], the output never reaches
    exactly 0 or 1.
    """
    return 1./(1 + np.exp(-clip(z)))

def linear_predict(w, x):
    """Probability of DIG_B under the linear model: sigmoid of w . flattened x."""
    pixels = x.flatten()
    logit = w.dot(pixels)
    return sigmoid(logit)

In [39]:
# sanity checks

w = linear_init()

# Negating the weights flips the predicted probability around 0.5, so the
# two accuracies should add up to 1.
vsa = judge(lambda x : linear_predict(+w, x), x_train, y_train)["acc"]
vsb = judge(lambda x : linear_predict(-w, x), x_train, y_train)["acc"]
assert close_enough(vsa + vsb, 1.)

# Zero weights give probability 0.5 everywhere, hence a loss of log(2).
ffl = judge(lambda x: linear_predict(0*w, x), x_train, y_train)["loss"]
assert close_enough(ffl, np.log(2))

# Use the weight vector itself as the input image: w.dot(w) > 0, so the
# model must predict DIG_B.
x = w.reshape(SIDE, SIDE)
vsa = judge(lambda x: linear_predict(w, x), [x], [DIG_A])["acc"]
vsb = judge(lambda x: linear_predict(w, x), [x], [DIG_B])["acc"]
assert close_enough(vsa, 0.)
assert close_enough(vsb, 1.)

print("hooray!")

hooray!


In [40]:
# ========================================================================
# ======= Backward pass ==================================================

""" For given x, y, we want derivative (with respect w) of
    l(w) = loss(sigmoid(w.dot(x)), y)
         = loss_at_y(sigmoid(dot_with_x(w))
    where loss_at_y(p) = -log(p if y == DIG_B else 1-p)
    where sigmoid(z) = 1/(1+exp(-z))
    where dot_with_x(w) = w.dot(x)

By CHAIN RULE : 
    l'(w) = (
          loss_at_y'(sigmoid(dot_with_x(w)))
        * sigmoid'(dot_with_x(w))
        * dot_with_x'(w)
    ) = (
          loss_at_y'(p)
        * sigmoid'(z)
        * dot_with_x'(w)
    )
    where z = dot_with_x(w)
    where p = sigmoid(z)
    NOTE: appearance of terms from forward pass!
"""

def linear_backprop_unsimp(w, x, y):
    """Gradient of the cross-entropy loss w.r.t. w via the unsimplified chain rule.

    Multiplies the three factors dl/dp, dp/dz and dz/dw explicitly, where
    z = w . flattened x and p = sigmoid(z).
    """
    pixels = x.flatten()
    p = sigmoid(w.dot(pixels))

    # dl/dp of -log(p) when the label is DIG_B, of -log(1-p) otherwise
    if y == DIG_B:
        dl_dp = -1 / p
    else:
        dl_dp = 1 / (1 - p)
    dp_dz = p * (1 - p)   # derivative of the sigmoid, expressed through p
    dz_dw = pixels        # z is linear in w

    return dl_dp * dp_dz * dz_dw

def linear_backprop(w, x, y):
    """Gradient of the cross-entropy loss w.r.t. w, in simplified form.

    The product (dl/dp) * (dp/dz), with
        dl/dp = -1/p if y == DIG_B else 1/(1-p)
        dp/dz = p * (1 - p)
    collapses to p - target, where target is 1 for DIG_B and 0 otherwise:
    the error of p as an estimator of the one-hot version of y.
    """
    pixels = x.flatten()
    p = sigmoid(w.dot(pixels))

    target = 1 if y == DIG_B else 0
    dl_dz = p - target

    # dz/dw is just the flattened input
    return dl_dz * pixels

In [41]:
# sanity checks
# Repeat the checks on 10 random (weights, training example) pairs.
for _ in range(10):
    w = linear_init()
    idx = np.random.randint(N)
    x = x_train[idx]
    y = y_train[idx]

    # check that simplification preserved answer
    g_unsimp = linear_backprop_unsimp(w, x, y)
    g              = linear_backprop(w, x, y)
    assert close_enough(g_unsimp, g)

    # do a step of gradient descent, check loss decreased
    before = judge(lambda xx : linear_predict(w, xx), [x], [y])["loss"]
    w = linear_displace(w, -0.01, g)
    after = judge(lambda xx: linear_predict(w, xx), [x], [y])["loss"]
    assert after < before

print("hooray!")

hooray!


In [42]:
# ===========================================================================
# ====== VANILLA MODEL ======================================================
# ===========================================================================

In [43]:
# ===========================================================================    
# ====== Weight Helpers =====================================================

# Layer sizes of the dense network.
D0 = SIDE*SIDE  # input: one entry per pixel
D1 = 32         # first hidden layer
D2 = 32         # second hidden layer
D3 = 1          # output: a single logit

def vanilla_init():
    """Draw random weights (A, B, C) for the three dense layers.

    Shapes are A: D3 x D2, B: D2 x D1, C: D1 x D0. Each matrix holds
    standard-normal samples divided by the square root of a fan term
    (1 + D2, D2 + D1 and D1 + D0 respectively). Samples are drawn from
    numpy's global RNG in the order A, B, C.
    """
    shapes_and_fans = (
        ((D3, D2),  1 + D2),
        ((D2, D1), D2 + D1),
        ((D1, D0), D1 + D0),
    )
    A, B, C = (np.random.randn(*shape) / np.sqrt(fan)
               for shape, fan in shapes_and_fans)
    return (A,B,C)

def vanilla_displace(abc, coef, g):
    """Return the weight triple abc displaced by coef times the triple g.

    Both abc and g must unpack into exactly three components; the result is
    the tuple of componentwise w + coef * dw.
    """
    A, B, C = abc
    gA, gB, gC = g
    pairs = ((A, gA), (B, gB), (C, gC))
    return tuple(weight + coef * grad for weight, grad in pairs)

In [44]:
""" what architecture? well, let's use this one:
    
    lrelu(z) = max(z/10, z)
    
    x
   h0 ---------> z1 -----> h1 ---------> z2 ----> h2 --------> z3 ------> p
    |
    |             |         |                                               
    |             |         |             |        |                        
    |     C*      | lrelu   |      B*     | lrelu  |     A*    | sigmoid  |
    |             |         |             |        |                      
    |             |         |                      1                      
    |                       1
    1                                                                   
    D0            D1        D1            D2       D2          1          1
    SIDE*SIDE     32                      32                   1          1
    
    D1 = 32; D2 = 32            
    """

" what architecture? well, let's use this one:\n    \n    lrelu(z) = max(z/10, z)\n    \n    x\n   h0 ---------> z1 -----> h1 ---------> z2 ----> h2 --------> z3 ------> p\n    |\n    |             |         |                                               \n    |             |         |             |        |                        \n    |     C*      | lrelu   |      B*     | lrelu  |     A*    | sigmoid  |\n    |             |         |             |        |                      \n    |             |         |                      1                      \n    |                       1\n    1                                                                   \n    D0            D1        D1            D2       D2          1          1\n    SIDE*SIDE     32                      32                   1          1\n    \n    D1 = 32; D2 = 32            \n    "

In [45]:
# ===========================================================================    
# ====== Forward Pass ======================================================

def lrelu(z):
    """Leaky ReLU: elementwise max(z/10, z)."""
    return np.maximum(z/10, z)

def step(z):
    """Heaviside step function, taking the value 0.5 at z == 0."""
    return np.heaviside(z, 0.5)

def dlrelu_dz(z):
    """Derivative of lrelu: 0.1 for negative z, 1 for positive z, 0.55 at z == 0."""
    return .1 + (1.-.1)*step(z)

def vanilla_predict(abc, x):
    """Forward pass of the dense network; returns the probability of DIG_B.

    The flattened image passes through two leaky-ReLU layers (weights C,
    then B); the result h2 is the learned featurization, on which A acts
    as a linear classifier followed by a sigmoid.
    """
    A, B, C = abc

    h0 = x.flatten()
    h1 = lrelu(C.dot(h0))
    h2 = lrelu(B.dot(h1))   # learned features
    z3 = A.dot(h2)          # linear classifier on top of the features
    return sigmoid(z3)

In [46]:
# sanity checks

A,B,C = vanilla_init()

# check that linear layer makes sense:
# negating A negates the final logit, so the two accuracies add up to 1
vsa = judge(lambda x : vanilla_predict((+A,B,C), x), x_train, y_train)["acc"]
vsb = judge(lambda x : vanilla_predict((-A,B,C), x), x_train, y_train)["acc"]
assert close_enough(vsa + vsb, 1.)

# a zero last layer gives probability 0.5 everywhere, hence a loss of log(2)
ffl = judge(lambda x: vanilla_predict((0*A,B,C), x), x_train, y_train)["loss"]
assert close_enough(ffl, np.log(2))

# check end-to-end positivity
# With all weights made non-negative (and pixel values in [0, 1]) the logit is
# positive, so DIG_B is predicted. Flipping the sign of one of B, C makes the
# logit negative; flipping both restores it.
x = x_train[0]
y = y_train[0]
A = np.abs(A)
B = np.abs(B)
C = np.abs(C)
acc_ppp = judge(lambda x: vanilla_predict((A,B,C), x), [x], [DIG_B])["acc"]
acc_ppn = judge(lambda x: vanilla_predict((A,B,-C), x), [x], [DIG_B])["acc"]
acc_pnp = judge(lambda x: vanilla_predict((A,-B,C), x), [x], [DIG_B])["acc"]
acc_pnn = judge(lambda x: vanilla_predict((A,-B,-C), x), [x], [DIG_B])["acc"]
assert close_enough(acc_ppp, 1.)
assert close_enough(acc_ppn, 0.)
assert close_enough(acc_pnp, 0.)
assert close_enough(acc_pnn, 1.)

print("hooray!")


hooray!


In [47]:
def vanilla_backprop(abc, x, y):
    """Gradient of the cross-entropy loss w.r.t. the dense weights (A, B, C).

    Repeats the forward pass of vanilla_predict while keeping the
    intermediate activations, then applies the chain rule layer by layer.
    Returns the triple (dl_dA, dl_dB, dl_dC) in the same order as abc.
    """
    A, B, C = abc

    # ---- forward pass, remembering every intermediate value ----
    h0 = x.flatten()
    z1 = C.dot(h0)
    h1 = lrelu(z1)
    z2 = B.dot(h1)
    h2 = lrelu(z2)
    p = sigmoid(A.dot(h2))

    # ---- backward pass ----
    target = 1 if y==DIG_B else 0
    dl_dz3 = p - target
    # through the last linear layer, then elementwise through lrelu
    dl_dz2 = (dl_dz3 * A) * dlrelu_dz(z2)
    # through the middle linear layer, then elementwise through lrelu
    dl_dz1 = np.matmul(dl_dz2, B) * dlrelu_dz(z1)

    # each weight gradient pairs a layer's output gradient with its input
    dl_dA = dl_dz3 * h2
    dl_dB = np.outer(dl_dz2, h1)
    dl_dC = np.outer(dl_dz1, h0)
    return (dl_dA, dl_dB, dl_dC)

In [48]:
# sanity check
    
# Repeat on 10 random (weights, training example) pairs.
for _ in range(10):
    abc = vanilla_init()
    idx = np.random.randint(N)
    x = x_train[idx]
    y = y_train[idx]

    # do a step of gradient descent, check loss decreased
    before = judge(lambda xx : vanilla_predict(abc, xx), [x], [y])["loss"]
    g = vanilla_backprop(abc, x, y)
    abc = vanilla_displace(abc, -.01, g)
    after = judge(lambda xx: vanilla_predict(abc, xx), [x], [y])["loss"]
    assert after < before

print("hooray!")

hooray!


In [49]:
# ===========================================================================
# ====== CONVOLUTIONAL NEURAL NETWORK =======================================
# ===========================================================================

In [50]:
# ===========================================================================    
# ====== Weight Helpers =====================================================

# Same values as used for the dense model. conv_init in this cell hard-codes
# its own weight shapes and does not read these constants.
D0 = SIDE*SIDE
D1 = 32
D2 = 32
D3 = 1

def conv_init():
    """Draw random weights (A, B, C) for the convolutional network.

    A is a flat vector of length 5*5*4 (dense layer), B has shape 1x1x4x8
    and C has shape 5x5x8x1 (convolution kernels). Each holds
    standard-normal samples, drawn from numpy's global RNG in the order
    A, B, C, divided by the square root of a fan term.
    """
    dense_raw = np.random.randn(5*5*4)
    pointwise_raw = np.random.randn(1,1,4,8)
    kernel_raw = np.random.randn(5,5,8,1)

    A = dense_raw / np.sqrt( 1 + 5*5*4)
    B = pointwise_raw / np.sqrt(4 + 1*1*8)
    C = kernel_raw / np.sqrt(8 + 5*5*1)
    return (A,B,C)

def conv_displace(abc, coef, g):
    """Return the weight triple abc displaced by coef times the triple g.

    Both abc and g must unpack into exactly three components; each result
    component is weight + coef * gradient.
    """
    A, B, C = abc
    gA, gB, gC = g
    new_A = A + coef * gA
    new_B = B + coef * gB
    new_C = C + coef * gC
    return (new_A, new_B, new_C)

In [51]:
""" architecture

in the chart below, we transform inputs (top) to outputs (bottom)
                height x width x channels
    x               28 x 28 x 1
        avgpool                                        2x2
    h0              14 x 14 x 1       
        conv                         weight C          5x5x8x1      stride 2x2
    z1              5  x 5  x 8
        lrelu
    h1              5  x 5  x 8       
        conv                         weight B          1x1x4x8      stride 1x1
    z2              5  x 5  x 4
        lrelu
    h2              5  x 5  x 4
        dense                        weight A          1x(5*5*4)
    z3                       1
        sigmoid
    p

"""

' architecture\n\nin the chart below, we transform inputs (top) to outputs (bottom)\n                height x width x channels\n    x               28 x 28 x 1\n        avgpool                                        2x2\n    h0              14 x 14 x 1       \n        conv                         weight C          5x5x8x1      stride 2x2\n    z1              10 x 10 x 8\n        lrelu\n    h1              5  x 5  x 8       \n        conv                         weight B          1x1x4x8      stride 1x1\n    z2              5  x 5  x 4\n        lrelu\n    h2              5  x 5  x 4\n        dense                        weight A          1x(5*5*4)\n    z3                       32\n        sigmoid\n    p\n\n'

In [52]:
# ===========================================================================
# ====== Building Blocks ====================================================

def avgpool2x2(x):
    """Average-pool an H x W x C array over non-overlapping 2x2 windows.

    Parameters:
        x: array of shape (H, W, C).

    Returns:
        Array of shape (H//2, W//2, C) whose entry [i, j] is the mean of
        x[2i:2i+2, 2j:2j+2]. When H or W is odd, the trailing row/column
        that does not fill a complete window is ignored.
    """
    H, W, C = x.shape
    # Crop to an even extent so the four strided views below all share the
    # same shape; otherwise odd sizes either raise or silently broadcast.
    HE, WE = H - H % 2, W - W % 2
    return (x[0:HE:2, 0:WE:2] +
            x[0:HE:2, 1:WE:2] +
            x[1:HE:2, 0:WE:2] +
            x[1:HE:2, 1:WE:2] )/4

def conv(x, weights, stride=1):
    """Strided 'valid' convolution (no padding, no kernel flip).

    Parameters:
        x:       input of shape H x W x C.
        weights: kernel of shape KH x KW x OD x ID, with ID == C.
        stride:  step between neighbouring windows, in both directions.

    Returns:
        Array of shape HH x WW x OD, where HH = int((H-KH+1)/stride) and
        WW = int((W-KW+1)/stride). Entry [i, j] contracts the window
        x[i*stride : i*stride+KH, j*stride : j*stride+KW] with the kernel
        over the height, width and input-channel axes.
    """
    H, W, C = x.shape
    KH, KW, OD, ID = weights.shape
    assert C==ID
    HH, WW = int((H-KH+1)/stride), int((W-KW+1)/stride)

    rows = []
    for h in range(0, HH*stride, stride):
        row = []
        for w in range(0, WW*stride, stride):
            window = x[h:h+KH, w:w+KW]                  # KH x KW x ID
            row.append(np.tensordot(weights, window,    # -> OD
                                    ((0,1,3), (0,1,2))))
        rows.append(row)
    return np.array(rows)


In [53]:
# ===========================================================================
# ====== Forward Pass =======================================================

""" architecture

in the chart below, we transform inputs (top) to outputs (bottom)
                height x width x channels
    x               28 x 28 x 1
        avgpool                                        2x2
    h0              14 x 14 x 1       
        conv                         weight C          5x5x8x1      stride 2x2
    z1              5  x 5  x 8
        lrelu
    h1              5  x 5  x 8       
        conv                         weight B          1x1x4x8      stride 1x1
    z2              5  x 5  x 4
        lrelu
    h2              5  x 5  x 4
        dense                        weight A          1x(5*5*4)
    z3                       1
        sigmoid
    p

"""

def conv_predict(abc, x):
    """Forward pass of the convolutional network; returns the probability of DIG_B.

    Pipeline: add a channel axis -> 2x2 average pool -> conv with C
    (stride 2) -> lrelu -> conv with B (stride 1) -> lrelu -> dense layer A
    on the flattened features -> sigmoid.
    """
    A, B, C = abc

    h0 = avgpool2x2(x[:,:,np.newaxis])
    h1 = lrelu(conv(h0, C, stride=2))
    h2 = lrelu(conv(h1, B, stride=1))
    z3 = A.dot(h2.flatten())
    return sigmoid(z3)

In [54]:
# sanity checks
# scaling and shape tests
# Pooling an all-ones array halves height and width and leaves the values at 1.
aa = np.ones((8,12,7))
pp = 1*np.ones((4,6,7))
assert close_enough(avgpool2x2(aa), pp)

# An all-ones 3x3 kernel over 7 input channels sums 3*3*7 ones per output entry.
ww = np.ones((3,3,5,7))
cc = (3*3*7)*np.ones((6, 10, 5))
assert close_enough(conv(aa, ww, stride=1), cc)

# orientation test
bb = np.array([1*np.eye(4), 3*np.eye(4)]) # 2 x 4 x 4
"""
    bb == [
        [[1,0,0,0], [0,1,0,0], [0,0,1,0], [0,0,0,1]],
        [[3,0,0,0], [0,3,0,0], [0,0,3,0], [0,0,0,3]],
        ]
"""
pp = np.array([[[1,1,0,0,], [0,0,1,1]]])
assert close_enough(avgpool2x2(bb), pp)
# Only kernel position [0, 0] is non-zero, so each output entry reads a single
# input pixel weighted by 1..4 across its channels.
ww = np.zeros((2,2,1,4))
ww[0,0,:,:] = 1 + np.arange(4)
cc = np.array([1,2,3])[np.newaxis, :, np.newaxis] # shape 1 x 3 x 1
assert close_enough(conv(bb, ww, stride=1), cc)

print("hooray!")


hooray!


In [55]:
# ------------ Derivatives ------------------------------------------------

''' Why do we want dconv(x, w)/dw? So and only so that we can compute
    dl/dw from dl/dconv(x,w). We ease our lives by writing a function that directly
    gives dl/dw from dl/dconv(x,w).
'''

def Dw_conv(x, weights_shape, dl_dconv, stride=1):
    """Gradient of the loss w.r.t. the kernel of conv(x, weights, stride).

    Parameters:
        x:             the input that was convolved, shape H x W x C.
        weights_shape: shape (KH, KW, OD, ID) of the kernel, with ID == C.
        dl_dconv:      gradient of the loss w.r.t. the convolution output,
                       shape HH x WW x OD.
        stride:        stride that was used in the forward convolution.

    Returns:
        Array of shape KH x KW x OD x ID (the kernel's shape).
    """
    H, W, C = x.shape
    KH, KW, OD, ID = weights_shape
    assert C == ID
    HH, WW = int((H-KH+1)/stride), int((W-KW+1)/stride)
    assert dl_dconv.shape == (HH, WW, OD)
    HS, WS = HH*stride, WW*stride

    grad_rows = []
    for dh in range(KH):
        grad_row = []
        for dw in range(KW):
            # every input pixel that kernel entry [dh, dw] touched,
            # one per output position                          HH x WW x ID
            touched = x[dh:dh + HS:stride, dw:dw + WS:stride]
            # sum over output positions                        -> OD x ID
            grad_row.append(np.tensordot(dl_dconv, touched, ((0,1), (0,1))))
        grad_rows.append(grad_row)
    dl_dw = np.array(grad_rows)
    return dl_dw

''' Why do we want dconv(x, w)/dx? So and only so that we can compute
    dl/dx from dl/dconv(x,w). We ease our lives by writing a function that directly
    gives dl/dx from dl/dconv(x,w).
'''

def Dx_conv(x_shape, weights, dl_dconv, stride=1):
    """Gradient of the loss w.r.t. the input of conv(x, weights, stride).

    Parameters:
        x_shape:  shape (H, W, C) of the input that was convolved.
        weights:  kernel of shape KH x KW x OD x ID, with ID == C.
        dl_dconv: gradient of the loss w.r.t. the convolution output,
                  shape HH x WW x OD.
        stride:   stride used in the forward convolution (defaults to 1,
                  matching conv and Dw_conv).

    Returns:
        Array of shape H x W x ID. Its dtype is the promoted dtype of
        weights and dl_dconv (at least float32), so float64 gradients are
        no longer truncated to single precision.
    """
    H, W, C = x_shape
    KH, KW, OD, ID = weights.shape
    assert C == ID
    HH, WW = int((H-KH+1)/stride), int((W-KW+1)/stride)
    # same consistency check as Dw_conv performs
    assert dl_dconv.shape == (HH, WW, OD)

    acc_dtype = np.result_type(weights, dl_dconv, np.float32)
    dl_dx = np.zeros((H,W,ID), dtype=acc_dtype)
    for h in range(KH):
        for w in range(KW):
            # kernel entry [h, w] touched input pixels
            # (h + i*stride, w + j*stride); route each output gradient back
            # to the pixel it came from, mixing channels through weights[h, w]
            dl_dx[h:h+HH*stride:stride, w:w + WW*stride:stride] += (
                np.tensordot(
                    dl_dconv,           # HH x WW x OD
                    weights[h,w],       # OD x ID
                    ((2,),(0,))
                    )
            )

    return dl_dx

In [56]:
# ===========================================================================
# ====== Backward Pass ======================================================

def conv_backprop(abc, x, y):
    """Gradient of the cross-entropy loss w.r.t. the conv-net weights (A, B, C).

    Repeats the forward pass of conv_predict while keeping the intermediate
    activations, then applies the chain rule layer by layer. Returns the
    triple (dl_dA, dl_dB, dl_dC) in the same order as abc.
    """
    A, B, C = abc

    # ---- forward pass, remembering every intermediate value ----
    h0 = avgpool2x2(x[:,:,np.newaxis])
    z1 = conv(h0, C, stride=2)
    h1 = lrelu(z1)
    z2 = conv(h1, B, stride=1)
    h2 = lrelu(z2)
    p = sigmoid(A.dot(h2.flatten()))

    # ---- backward pass ----
    target = 1 if y==DIG_B else 0
    dl_dz3 = p - target
    # through the dense layer (A viewed in h2's shape), then through lrelu
    dl_dz2 = (dl_dz3 * A.reshape(h2.shape)) * dlrelu_dz(z2)
    # through the stride-1 convolution, then elementwise through lrelu
    dl_dz1 = Dx_conv(h1.shape, B, dl_dz2, stride=1) * dlrelu_dz(z1)

    dl_dA = dl_dz3 * h2.flatten()
    dl_dB = Dw_conv(h1, B.shape, dl_dz2, stride=1)
    dl_dC = Dw_conv(h0, C.shape, dl_dz1, stride=2)
    return (dl_dA, dl_dB, dl_dC)

In [57]:
# ========================================================================
# ======= TRAINING LOOP ==================================================

# ========================================================================
# ======= Training Parameters ==================================================
T = 15001             # number of SGD steps per model
DT = 1000             # report metrics every DT steps
LEARNING_RATE = 0.01  # learning rate at step 0
ANNEAL_T = 4000       # time scale of the learning-rate decay: LR ~ ANNEAL_T / (ANNEAL_T + t)
DRAG_COEF = 0.1       # fraction of the momentum forgotten at every step

idx = 0
def next_training_example():
    """Return the next (image, label) pair, cycling through the training set.

    Reads and updates the module-level cursor idx. After the last example
    of a pass has been handed out, the cursor is reset to 0 and the global
    x_train / y_train are rebound to a freshly shuffled copy.
    """
    global idx, x_train, y_train
    example = (x_train[idx], y_train[idx])
    if idx + 1 == N:
        # end of a pass: rewind and reshuffle for the next one
        idx = 0
        x_train, y_train = shuffle(x_train, y_train)
    else:
        idx = idx + 1
    return example

#-------- Interface with Model -------------------------------------------



# For each model name: (init, backprop, displace, predict).
#   init()               -> fresh weights
#   backprop(w, x, y)    -> gradient of the loss on one example w.r.t. the weights
#   displace(w, coef, g) -> w + coef * g
#   predict(w, x)        -> probability of DIG_B
FUNCS_BY_MODEL =    {
        "linear"    :(linear_init, linear_backprop, linear_displace, linear_predict),
        "vanilla"   :(vanilla_init, vanilla_backprop, vanilla_displace, vanilla_predict),
        "conv"      :(conv_init, conv_backprop, conv_displace, conv_predict)
        }



# ========================================================================
# ======= SGD: the Engine of Learning ====================================

# Train each model in turn with SGD plus momentum, reporting metrics as we go.
for MODEL in ("linear", "vanilla", "conv"):
    print("\n"*4)
    print(MODEL)
    print("\n"*2)
    INIT, BACK, DISP, PRED = FUNCS_BY_MODEL[MODEL]

    w = INIT()
    m = DISP(w, -1., w) # hacky way to set m=0 of same shape as w
    for t in range(T):
        x, y = next_training_example()
        g = BACK(w, x, y)
        # learning rate decays from LEARNING_RATE as ANNEAL_T / (ANNEAL_T + t)
        LR = LEARNING_RATE * float(ANNEAL_T) / (ANNEAL_T + t)
        m = DISP(m, -DRAG_COEF, m) # m forgets a bit of its past
        m = DISP(m, +1., g) # add gradient to momentum
        w = DISP(w, -LR, m) # update based on momentum

        # only report when t is a multiple of DT
        if t%DT : continue

        # metrics on (at most) the last 1000 training examples ...
        xs = x_train[-1000:]
        ys = y_train[-1000:]
        mstr = judge(lambda x: PRED(w, x), xs, ys)
        # ... and on (at most) the last 1000 test examples
        xs = x_test[-1000:]
        ys = y_test[-1000:]
        mste = judge(lambda x: PRED(w, x), xs, ys)
        print("at step {:6d}".format(t),
            "tr acc {:4.2f} loss {:5.3f}".format(mstr["acc"], mstr["loss"]),
            "te acc {:4.2f} loss {:5.3f}".format(mste["acc"], mste["loss"])
            )
        
    # final evaluation on the full training and test sets, with progress bars
    xs = x_train[:]
    ys = y_train[:]
    mstr = judge(lambda x: PRED(w, x), xs, ys, verbose=True)
    xs = x_test[:]
    ys = y_test[:]
    mste = judge(lambda x: PRED(w, x), xs, ys, verbose=True)
    print("after all training",
            "tr acc {:4.2f} loss {:5.3f}".format(mstr["acc"], mstr["loss"]),
            "te acc {:4.2f} loss {:5.3f}".format(mste["acc"], mste["loss"])
            )






linear



at step      0 tr acc 0.49 loss 0.853 te acc 0.49 loss 0.738
at step   1000 tr acc 0.84 loss 0.555 te acc 0.92 loss 0.374
at step   2000 tr acc 0.93 loss 0.201 te acc 0.92 loss 0.433
at step   3000 tr acc 0.95 loss 0.128 te acc 0.93 loss 0.285
at step   4000 tr acc 0.98 loss 0.046 te acc 0.93 loss 0.334
at step   5000 tr acc 1.00 loss 0.017 te acc 0.93 loss 0.330
at step   6000 tr acc 1.00 loss 0.014 te acc 0.93 loss 0.326
at step   7000 tr acc 1.00 loss 0.009 te acc 0.93 loss 0.328
at step   8000 tr acc 1.00 loss 0.008 te acc 0.93 loss 0.332
at step   9000 tr acc 1.00 loss 0.008 te acc 0.93 loss 0.332
at step  10000 tr acc 1.00 loss 0.008 te acc 0.93 loss 0.330
at step  11000 tr acc 1.00 loss 0.007 te acc 0.93 loss 0.334
at step  12000 tr acc 1.00 loss 0.007 te acc 0.93 loss 0.332
at step  13000 tr acc 1.00 loss 0.007 te acc 0.93 loss 0.331
at step  14000 tr acc 1.00 loss 0.007 te acc 0.93 loss 0.333
at step  15000 tr acc 1.00 loss 0.007 te acc 0.93 loss 0.334


100%|██████████| 1000/1000 [00:00<00:00, 178132.34it/s]
100%|██████████| 1991/1991 [00:00<00:00, 123529.77it/s]

after all training tr acc 1.00 loss 0.007 te acc 0.93 loss 0.336





vanilla



at step      0 tr acc 0.50 loss 0.697 te acc 0.51 loss 0.694





at step   1000 tr acc 0.65 loss 0.685 te acc 0.62 loss 0.679
at step   2000 tr acc 0.65 loss 0.595 te acc 0.87 loss 0.498
at step   3000 tr acc 0.91 loss 0.276 te acc 0.89 loss 0.302
at step   4000 tr acc 0.93 loss 0.213 te acc 0.90 loss 0.245
at step   5000 tr acc 0.95 loss 0.165 te acc 0.90 loss 0.299
at step   6000 tr acc 0.96 loss 0.120 te acc 0.90 loss 0.324
at step   7000 tr acc 0.99 loss 0.056 te acc 0.93 loss 0.220
at step   8000 tr acc 1.00 loss 0.028 te acc 0.92 loss 0.276
at step   9000 tr acc 0.99 loss 0.045 te acc 0.91 loss 0.339
at step  10000 tr acc 0.99 loss 0.014 te acc 0.92 loss 0.361
at step  11000 tr acc 1.00 loss 0.008 te acc 0.92 loss 0.370
at step  12000 tr acc 1.00 loss 0.004 te acc 0.92 loss 0.398
at step  13000 tr acc 1.00 loss 0.003 te acc 0.92 loss 0.399
at step  14000 tr acc 1.00 loss 0.003 te acc 0.92 loss 0.410
at step  15000 tr acc 1.00 loss 0.002 te acc 0.92 loss 0.420


100%|██████████| 1000/1000 [00:00<00:00, 24461.29it/s]
100%|██████████| 1991/1991 [00:00<00:00, 22160.40it/s]

after all training tr acc 1.00 loss 0.002 te acc 0.92 loss 0.400





conv








at step      0 tr acc 0.50 loss 0.693 te acc 0.51 loss 0.690
at step   1000 tr acc 0.74 loss 0.557 te acc 0.79 loss 0.410
at step   2000 tr acc 0.78 loss 0.458 te acc 0.91 loss 0.224
at step   3000 tr acc 0.76 loss 0.491 te acc 0.93 loss 0.215
at step   4000 tr acc 0.79 loss 0.442 te acc 0.94 loss 0.185
at step   5000 tr acc 0.79 loss 0.462 te acc 0.93 loss 0.224
at step   6000 tr acc 0.78 loss 0.492 te acc 0.92 loss 0.214
at step   7000 tr acc 0.81 loss 0.430 te acc 0.93 loss 0.180
at step   8000 tr acc 0.80 loss 0.429 te acc 0.94 loss 0.160
at step   9000 tr acc 0.80 loss 0.418 te acc 0.94 loss 0.156
at step  10000 tr acc 0.82 loss 0.405 te acc 0.94 loss 0.156
at step  11000 tr acc 0.81 loss 0.401 te acc 0.95 loss 0.151
at step  12000 tr acc 0.81 loss 0.401 te acc 0.95 loss 0.152
at step  13000 tr acc 0.81 loss 0.400 te acc 0.94 loss 0.156
at step  14000 tr acc 0.83 loss 0.393 te acc 0.94 loss 0.155
at step  15000 tr acc 0.83 loss 0.382 te acc 0.95 loss 0.151


100%|██████████| 1000/1000 [00:00<00:00, 1481.73it/s]
100%|██████████| 1991/1991 [00:01<00:00, 1301.57it/s]

after all training tr acc 0.83 loss 0.382 te acc 0.95 loss 0.150





### TEST DATA PERFORMANCE


#### N ~ 12000

Linear
acc: 0.97      loss: 0.93   

Vanilla
acc: 0.98      loss: 0.047

Convolution
acc: 0.98      loss: 0.045


#### N = 1000 with noise

Linear
acc: 0.93      loss: 0.336   

Vanilla
acc: 0.92      loss: 0.400

Convolution
acc: 0.95      loss: 0.150

--------------------------------------------------------
### CONCLUSION:
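Conv nets generalize better on this image domain: with only 1000 noisy training examples, the convolutional model reaches the best test accuracy (0.95) and the lowest test loss (0.150).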