In [51]:
import numpy as np
import scipy as sp
from sklearn import datasets
import mxnet as mx
from matplotlib import pyplot as plt
from torch import functional as F
# Fetch the MNIST dataset through MXNet's test utilities; later cells read
# its 'train_data' and 'train_label' entries.
mnist = mx.test_utils.get_mnist()

In [52]:
# Labels and images of the training split. The images are flattened so that
# each sample becomes one row.
# NOTE(review): assumes 'train_data' is an array whose first axis indexes
# samples — confirm against the loader.
y = mnist['train_label']
x = mnist['train_data']
x = x.reshape(len(x), -1)

In [53]:
def plot_loss(y):
    """Draw `y` as a line on the current axes, then show the figure."""
    axes = plt.gca()
    axes.plot(y)
    plt.show()

# Simple NN

In [46]:
# Layer widths. Weights must be of the format (in, hidden): each consecutive
# pair (shapes[i], shapes[i+1]) is used as the shape of one weight matrix.
shapes = [784, 10, 10]

### AFFINE ###
def l1(w):
    """
    w : array of any shape
    out : () [scalar]

    Mean of the absolute values of the entries of w (an average, not a sum).
    """
    magnitudes = np.absolute(w)
    return np.mean(magnitudes)

def b(w):
    """
    w : (m, k)
    out : (m, k)

    Binarize w: every entry becomes sign(w) multiplied by one shared scale,
    where the scale is l1(w) divided by the number of entries in w.

    NOTE(review): l1 already averages |w|, so the entry count is divided out
    a second time here — confirm that the extra 1/n is intended.
    """
    scale = l1(w) / w.size
    return scale * np.sign(w)

def b_affine(x, w):
    """
    x : (n, m)
    w : (m, k)
    out : (n, k)

    Matrix product of x with the binarized weights b(w) instead of w itself.
    """
    w_bin = b(w)
    return np.dot(x, w_bin)
##########

def affine(x, w):
    """
    x : (n, m)
    w : (m, k)
    out : (n, k)

    Plain matrix product of the inputs with the weights (no bias term).
    """
    out = np.dot(x, w)
    return out

def affine_(d, x, w):
    """
    x : (n, m)
    w : (m, k)
    d : (n, k)
    ---
    dw : (m, k)
    dx : (n, m)

    Backward pass of affine: given the upstream gradient d, returns a dict
    with the gradient for the inputs under 'x' and for the weights under 'w'.
    """
    grad_x = np.dot(d, w.T)
    grad_w = np.dot(x.T, d)
    return {'x': grad_x, 'w': grad_w}

def sigmoid(x):
    """
    x : (n, m)
    out : (n, m)

    Element-wise logistic function, delegated to scipy's expit.
    """
    logistic = sp.special.expit
    return logistic(x)

def sigmoid_(d, x):
    """
    x : (n, m)
    d : (n, m)
    ---
    dx : (n, m)

    Backward pass of sigmoid: the upstream gradient d scaled element-wise by
    the local derivative s * (1 - s), where s = sigmoid(x).
    """
    s = sp.special.expit(x)
    local_slope = s * (1 - s)
    return d * local_slope

def softmax_ce(x, y):
    r"""
    x : (n, m)  logits
    y : (n,)    integer class index for each row of x
    out : () [scalar]

    Mean softmax cross-entropy. Equation is

        1/n * \sum_i^n [ -log(e^x_{y_i} / \sum_j e^x_j) ]

    which is equivalently:

        1/n * \sum_i^n log(\sum_j e^x_j) - x_{y_i}

    The log-sum-exp term is evaluated after subtracting each row's maximum,
    which is mathematically identical but keeps np.exp from overflowing on
    large logits (or underflowing to log(0) on very negative ones).
    """
    n = x.shape[0]
    # Per-row maximum, kept as a column so it broadcasts against x.
    row_max = np.max(x, axis=1, keepdims=True)
    # log(sum_j e^{x_j}) = max + log(sum_j e^{x_j - max})
    log_norm = row_max[:, 0] + np.log(np.sum(np.exp(x - row_max), axis=1))
    return np.sum(log_norm - x[np.arange(n), y]) / n

def softmax_ce_(x, y):
    r"""
    x : (n, m)  logits
    y : (n,)    integer class index for each row of x
    ---
    dx : (n, m)

    Gradient of the mean softmax cross-entropy with respect to the logits.

    Back propagation for a single data point is:

    dL_i/dx_{ik} = -1_{k == y_i} + softmax(x_i)_k

    and the loss averages over the n data points, so every row is scaled
    by 1/n:

    dL/dx_{ik} = 1/n * (-1_{k == y_i} + softmax(x_i)_k)

    The softmax is computed on logits shifted by each row's maximum; the
    result is unchanged mathematically, but np.exp can no longer overflow
    and turn the gradient into NaN.
    """
    n = x.shape[0]
    # Shift by the row maximum so the largest exponent is exp(0) = 1.
    shifted_exp = np.exp(x - np.max(x, axis=1, keepdims=True))
    grad = shifted_exp / np.sum(shifted_exp, axis=1, keepdims=True)
    # Subtract the one-hot target from the softmax probabilities.
    grad[np.arange(n), y] -= 1
    return grad / n

  """
  """


In [50]:
# Number of gradient steps taken by the training loop.
iterations = 100
# Step size multiplied into the weight gradient on every update.
ALPHA = 1e4
# One standard-normal weight matrix per pair of consecutive layer widths.
W = [np.random.randn(n_in, n_out) for n_in, n_out in zip(shapes[:-1], shapes[1:])]

def b_affine_(d, x, w):
    """
    x : (n, m)
    w : (m, k)
    d : (n, k)
    ---
    dw : (m, k)
    dx : (n, m)
    WARNING: Untested!

    Backward pass for b_affine. dx is propagated through the binarized
    weights b(w). dw is the gradient with respect to the binarized weights,
    returned unchanged as the gradient for the real-valued w (no correction
    for the scale factor or for the sign function).

    Side effect: prints the norm of w on every call.
    """
    # Gradient w.r.t. the binarized weights, used as-is for the real weights.
    # Alternatives the author left disabled: scaling by l1(w) / n with a
    # (-1 <= w <= 1) mask, and adding the product-rule term
    # \sum_j d[l1(w)/n]/dw_i * sign(w_j), i.e. np.sum(dw_ * sign(w)) / n * sign(w).
    grad_w = np.dot(x.T, d)
    print(np.linalg.norm(w))
    grad_x = np.dot(d, b(w).T)
    return {'x': grad_x, 'w': grad_w}

# Gradient descent on the first layer only: the sigmoid and the second
# binarized layer are commented out, so the logits fed to the loss are
# b_affine(x, W[0]) directly.
for i in range(iterations):
    # Forward pass
    o1 = b_affine(x, W[0])
    # o2 = sigmoid(o1)
    # o3 = b_affine(o2, W[1])
    loss = softmax_ce(o1, y) #change to o3 later
    print("loss at {} : {}".format(i, loss))
    # Backward pass
    # `d` first holds the loss gradient w.r.t. the logits, and is then
    # rebound to the dict returned by b_affine_ (keys 'x' and 'w').
    d = softmax_ce_(o1, y)
    # d = b_affine_(d, o2, W[1])
    # dw1 = d['w']
    # d = sigmoid_(d['x'], o1)
    d = b_affine_(d, x, W[0])
    dw0 = d['w']
    # In-place step: mutates the array stored in W[0].
    W[0] -= dw0 * ALPHA
    # W[1] -= dw1 * ALPHA

loss at 0 : 2.3028296174658704
87.5858610673968
loss at 1 : 1.916419321427279
10407.399019983444
loss at 2 : 1.6468456915070915
18929.48025128426
loss at 3 : 1.453807945919955
26581.63486333422
loss at 4 : 1.310009563649874
33129.38398732028
loss at 5 : 1.2127134149039926
38991.216151467575
loss at 6 : 1.135768281960623
44103.73122633189
loss at 7 : 1.1090247387896608
48830.474984468616
loss at 8 : 1.025589298552166
53071.221306067746
loss at 9 : 1.0000645950421654
56922.30346321452
loss at 10 : 0.9523862182930388
60591.8810596474
loss at 11 : 0.9598377364703511
63925.945377298835
loss at 12 : 0.8910327144828444
67114.44689900786
loss at 13 : 0.8991222973771152
70006.88253109856
loss at 14 : 0.8186131250486954
72811.67277321359
loss at 15 : 0.8060569907099839
75284.05289059832
loss at 16 : 0.7761361948084753
77823.21443224521
loss at 17 : 0.7722921429432621
80088.08219371381
loss at 18 : 0.7368274002080816
82343.63229681825
loss at 19 : 0.7179419339742665
84328.51133317953
loss at 20 :

In [39]:
# Fraction of samples whose arg-max logit equals the label. `o1` holds the
# logits from the last forward pass of the training loop, i.e. computed
# before that iteration's weight update.
print((np.argmax(o1, axis=1) == y).mean())

0.8580833333333333


# Simple binary NN

In [None]:
# Toy network for inspecting the binarization by hand.
# Weights must be of the format (in, hidden)
shapes = [3, 1, 3]
W = [np.random.randn(n_in, n_out) for n_in, n_out in zip(shapes, shapes[1:])]
# Per-layer scale: mean absolute weight.
alpha = [np.abs(w).mean() for w in W]
# Per-layer sign matrix, stored as int8.
bW = [np.sign(w).astype(np.int8) for w in W]

In [None]:
# Single toy sample with three features, matching the 3-wide input layer.
# NOTE: this rebinds the global `x` that earlier held the flattened MNIST
# training data, so the training cell cannot be re-run after this one
# without re-running the data-loading cell first.
x = np.array([[-1,2,3]])
print(W[0])

In [None]:
# W[0] multiplied element-wise by its own sign, i.e. the magnitudes |W[0]|.
print(W[0] * np.sign(W[0]))

In [None]:
# Scaled sign matrix: mean(|W[0]|) * sign(W[0]).
# NOTE(review): b(W[0]) divides this scale by W[0].size once more, so the
# values printed here differ from the weights b_affine actually uses —
# confirm which scaling is intended.
print(alpha[0] * bW[0])

In [None]:
# Forward pass of the toy sample through the binarized first layer.
print(b_affine(x, W[0]))

In [None]:
# Backward pass with an upstream gradient of ones, shaped (1, 1) like the
# layer output for this single sample. b_affine_ also prints the norm of W[0].
d = b_affine_(np.ones((1,1)), x, W[0])

In [None]:
# Current weights next to the weight gradient returned by b_affine_.
print(W[0], d['w'])

In [None]:
# Shifted copy of the weights; W[0] itself is left unmodified.
# NOTE(review): the gradient is added here, whereas the training loop
# subtracts it (scaled by ALPHA) — confirm the sign is deliberate.
w0 = W[0] + d['w']
print(w0)

In [None]:
# Forward pass again, now with the shifted weights w0, for comparison with
# the earlier output computed from W[0].
print(b_affine(x, w0))

In [None]:
# Gradients of the real-valued affine layer for the same inputs, displayed
# as the cell result for comparison with the binarized version.
affine_(np.ones((1,1)), x, W[0])

In [None]:
# Same b_affine_ call as in the earlier cell, here displayed as the cell
# result (the call also prints the norm of W[0]).
b_affine_(np.ones((1,1)), x, W[0])