In [7]:
# Standard imports
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import network as Network
import copy

# Shallow vs. Deep Networks

In [8]:
# 5D -> 5D (linear mapping)
n_input = 5
n_output = 5

# Ill-conditioned matrix
# Generate an ill-conditioned matrix and push random targets through it to
# obtain the inputs. The task is then to learn to invert the matrix.
N = n_input
#np.random.seed(317)  # uncomment for a reproducible dataset
L = np.random.normal(size=[N,N])
# np.linalg.svd returns (U, S, Vh) with L == U @ diag(S) @ Vh, i.e. the third
# factor is ALREADY transposed. `V` therefore holds V^T and must be used
# as-is (not V.T) when rebuilding the matrix.
U, S, V = np.linalg.svd(L)
# Shrink the two smallest singular values to make the matrix ill-conditioned.
S[-2] = 0.01
S[-1] = 0.001
Ltrue = np.dot(U, np.dot(np.diag(S), V))


def _make_dataset(mixing, n_samples, noise_scale=0.1):
    """
    Draw `n_samples` (input, target) pairs for the matrix-inversion task.

    Targets are uniform on [-1, 1); each input is `mixing @ target` plus
    zero-mean Gaussian noise with standard deviation `noise_scale`.
    All targets are drawn before any input noise, so the global NumPy
    random stream is consumed in the same order as two separate loops.

    Returns (inputs, targets), each a list of 1-D arrays.
    """
    n_in, n_out = mixing.shape
    targets = [np.random.rand(n_out)*2 - 1. for _ in range(n_samples)]
    inputs = [np.dot(mixing, t) + np.random.normal(size=n_in, scale=noise_scale)
              for t in targets]
    return inputs, targets


# Create training and testing datasets (same generator, independent draws)
n_samples = 50
training_input, training_output = _make_dataset(Ltrue, n_samples)
test_input, test_output = _make_dataset(Ltrue, n_samples)

# Each dataset is [list of inputs, list of targets]
train = [training_input, training_output]
test = [test_input, test_output]

In [9]:
# Two networks with the same number of hidden nodes:
# 21 in one hidden layer versus 7 + 7 + 7 = 21 spread over three hidden layers.
# NOTE(review): assumes the list gives the node count of each layer, input
# layer first -- confirm against network.py.
shallow = Network.Network([5, 21, 5], type='regression')
deep = Network.Network([5, 7, 7, 7, 5], type='regression')

In [10]:
# Score both networks on the test set.
# NOTE(review): Evaluate presumably returns an error/loss -- confirm in network.py.
print('Shallow: ', shallow.Evaluate(test), sep='')
print('   Deep: ', deep.Evaluate(test), sep='')

Shallow: 1.3380822208247523
   Deep: 1.8390045077350248


In [11]:
# Train both networks on the same training set for the same number of epochs.
# NOTE: `prog` is reassigned by the second call, so only the value returned
# for the deep network remains available afterwards.
prog = shallow.Learn(train, epochs=500)
prog = deep.Learn(train, epochs=500)

A Jupyter Widget

A Jupyter Widget

In [12]:
# Score both networks on the held-out test set.
print('Shallow: ', shallow.Evaluate(test), sep='')
print('   Deep: ', deep.Evaluate(test), sep='')

Shallow: 0.9745982115820045
   Deep: 0.3275295075990131


In [13]:
# Score both networks on the training set, for comparison with the test-set
# scores.
print('Shallow: ', shallow.Evaluate(train), sep='')
print('   Deep: ', deep.Evaluate(train), sep='')

Shallow: 0.13198921383461448
   Deep: 0.3101549689131738


### Let's try a REALLY deep network!

In [14]:
# Same 5 -> 5 regression task, but with a long stack of width-7 layers between
# the 5-wide first and last layers.
really_deep = Network.Network([5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5], type='regression')

In [15]:
# Train with the same data and epoch count used for `shallow` and `deep`.
rdeep = really_deep.Learn(train, epochs=500)

A Jupyter Widget

In [16]:
# Test-set score of the very deep network (displayed as the cell output).
really_deep.Evaluate(test)

0.8553537144378951

In [17]:
# Training-set score of the very deep network (displayed as the cell output).
really_deep.Evaluate(train)

0.8422603741189725

# Vanishing Gradients

In [18]:
# 1D -> 1D (linear mapping)
n_input = 1
n_output = 1

def myfunc(x):
    """Ground-truth line that generates the targets: y = 0.4*x - 0.9."""
    return 0.4*x-0.9

# Every sample is stored as a length-1 array so inputs and targets have the
# same shape in all three datasets.
n_samples = 50
xv = np.linspace(-1, 1, n_samples)

# Create a training dataset: evenly spaced inputs, noisy targets
training_input = []
training_output = []
for x in xv:
    t = myfunc(x) + np.random.normal(scale=0.1)
    training_input.append(np.array([x]))
    training_output.append(np.array([t]))

# Create a testing dataset: jittered inputs, noisy targets.
# The input noise is drawn before the target noise for each sample.
test_input = []
test_output = []
for x0 in xv:
    x = x0 + np.random.normal(scale=0.1)
    t = myfunc(x) + np.random.normal(scale=0.1)
    test_input.append(np.array([x]))
    test_output.append(np.array([t]))

# Create a perfect dataset: the test inputs paired with noise-free targets
perfect_input = []
perfect_output = []
for x in test_input:
    t = myfunc(x[0])
    perfect_input.append(np.array(x))      # copy, so test_input is not aliased
    perfect_output.append(np.array([t]))   # length-1 array, like the other targets

# Each dataset is [list of inputs, list of targets]
train = [training_input, training_output]
test = [test_input, test_output]
perfect = [perfect_input, perfect_output]

In [19]:
# Compute gradients using backprop
def Backprop(net, t):
    """
    Print the back-propagated error signal at every non-input layer of `net`.

    Reads the activations stored in `net.h` (presumably those left by the
    most recent FeedForward call -- confirm in network.py) and the weights
    in `net.W`. Nothing is returned and the network is not modified.

    The top-layer signal is `net.h[-1] - t`. For each lower layer l it is
    h[l] * (1 - h[l]) * (W[l]^T @ signal_above), i.e. the hidden units are
    treated as logistic.

    Side effect: sets NumPy's global print precision to 3, which also
    affects arrays displayed after this call.

    Parameters:
        net: object exposing `n_layers`, `h` (per-layer activations) and
             `W` (per-connection weight matrices).
        t:   target array for the output layer.
    """
    np.set_printoptions(precision=3)

    def report(layer, signal):
        # One line per layer: index, rounded Euclidean norm, raw vector.
        magnitude = np.round(np.linalg.norm(signal), decimals=4)
        print(f'layer {layer!s} norm={magnitude!s} {signal!s}')

    top = net.n_layers - 1
    print(f'{net.n_layers!s} layers')

    signal = net.h[-1] - t
    report(top, signal)

    # Walk from the last hidden layer down to layer 1 (the input layer is skipped).
    for layer in reversed(range(1, top)):
        activation = net.h[layer]
        slope = activation*(1.-activation)
        pulled_back = np.dot(net.W[layer].T, signal)
        signal = (slope*pulled_back).flatten()
        report(layer, signal)

In [20]:
# A 1 -> 1 regression network with four width-4 layers in between, used to
# inspect how the back-propagated signal changes from layer to layer.
net = Network.Network([1, 4, 4, 4, 4, 1], type='regression')

In [21]:
# Pick one training sample at random and run it through the (untrained) network.
# `p` is reused by the Backprop call in the next cell to select the matching target.
p = np.random.randint(len(training_input))
net.FeedForward(training_input[p])

array([-0.54504233])

In [22]:
# Print the per-layer error signal for the sample fed forward above.
Backprop(net, training_output[p])

6 layers
layer 5 norm=0.551 [0.551]
layer 4 norm=0.1082 [-0.025 -0.084  0.002 -0.063]
layer 3 norm=0.0274 [ 0.026 -0.    -0.005 -0.008]
layer 2 norm=0.0114 [ 0.01   0.003  0.001 -0.005]
layer 1 norm=0.0035 [-0.003  0.002  0.    -0.   ]


In [23]:
# Display the network's weight matrices. They print with 3 decimals because
# Backprop set NumPy's global print precision.
net.W

[array([[ 0.148],
        [-2.288],
        [ 0.971],
        [ 0.633]]), array([[-1.009,  0.601, -0.168, -0.088],
        [-0.142, -0.378,  0.114,  0.303],
        [ 0.003,  0.306,  0.27 ,  0.074],
        [ 0.541, -0.575, -0.433,  0.185]]), array([[ 1.368,  0.522,  0.373, -0.928],
        [ 0.258,  0.129, -1.   ,  0.137],
        [ 0.464, -0.37 , -0.418,  0.018],
        [-0.979,  0.206,  0.999,  0.198]]), array([[ 0.23 , -0.364,  0.096, -0.345],
        [-0.829,  0.165,  0.397,  0.488],
        [ 0.159, -0.129,  0.514,  0.399],
        [-0.634, -0.067, -0.243, -0.012]]), array([[-0.194, -0.769,  0.017, -0.557]])]

# Exploding Gradients

In [24]:
# A chain of seven single-node layers in which every weight is forced to 8
# and every bias to -4.
net2 = Network.Network([1] * 7, type='regression')
for weights in net2.W:
    weights.fill(8.)
for biases in net2.b:
    biases.fill(-4.)

In [25]:
# Feed 0.5 through the chain: with weight 8 and bias -4 the first
# pre-activation is 8*0.5 - 4 = 0.
# NOTE(review): if hidden units are logistic (as Backprop's h*(1-h) term
# assumes), every hidden output is again 0.5 -- confirm in network.py.
net2.FeedForward(np.array([0.5]))

array([0.])

In [26]:
# Print the per-layer error signal for net2 with target 0.5.
Backprop(net2, np.array([0.5]))

7 layers
layer 6 norm=0.5 [-0.5]
layer 5 norm=1.0 [-1.]
layer 4 norm=2.0 [-2.]
layer 3 norm=4.0 [-4.]
layer 2 norm=8.0 [-8.]
layer 1 norm=16.0 [-16.]
