In [None]:
from jax import numpy as jnp, random, jit, grad, pmap, vmap, tree_map, local_device_count, devices
from jax.tools.colab_tpu import setup_tpu
from jax.lax import fori_loop, while_loop, switch, cond, pmean
from jax._src.lax.slicing import dynamic_slice

import numpy as np
import matplotlib.pyplot as plt
from traitlets.config.configurable import deepcopy
from collections import namedtuple
from typing import NamedTuple
from functools import partial
import csv

In [None]:
# setup_tpu() only applies to Colab TPU runtimes, which export COLAB_TPU_ADDR.
# Skipping it elsewhere lets the notebook run top-to-bottom on CPU/GPU runtimes
# (the devices() output recorded further down in this notebook is a GPU device).
import os

if "COLAB_TPU_ADDR" in os.environ:
  setup_tpu()
# Number of accelerator devices visible to this process.
n_devices = local_device_count()

In [None]:
# Fixed seed so weight initialisation is reproducible; `key` is the root PRNG
# key that make_network / make_layer split from.
seed = 0
key = random.PRNGKey(seed)

In [None]:
# CONST
# Scale applied to the non-positive side of ELU and Leaky ReLU.
ALPHA = 0.3

# activation functions and activation derivatives
# Selector values: each one is the index of the matching branch in the lists
# handed to `switch` by `activation` / `d_activation`.
SIGMOID, RELU, ELU, LRELU, TANH = range(5)

# sigmoid
@jit
def sigmoid(x):
  """Logistic function 1 / (1 + e^-x), applied elementwise."""
  denominator = 1 + jnp.exp(-x)
  return 1 / denominator

@jit
def d_sigmoid(x):
  """Derivative of the logistic function: sigmoid(x) * (1 - sigmoid(x))."""
  activated = sigmoid(x)
  return (1 - activated) * activated

# ReLU
@jit
def relu(x):
  """Rectified linear unit: x where x > 0, otherwise 0 (elementwise)."""
  # jnp.maximum instead of `(x > 0) * x`: the mask product evaluates
  # False * -inf, which is NaN, so an infinite negative input poisoned the output.
  return jnp.maximum(x, 0)
@jit
def d_relu(x):
  """Derivative of ReLU: 1.0 where x > 0, otherwise 0.0."""
  is_positive = x > 0
  return is_positive * 1.0

# ELU
@jit
def elu(x):
  """Exponential linear unit: x where x > 0, ALPHA * (e^x - 1) where x <= 0."""
  # The exponential is only meaningful on the non-positive side. Evaluating it
  # on large positive inputs overflows to inf, and the old mask arithmetic then
  # computed False * inf = NaN. Clamp the exponent input and select with where.
  negative_side = ALPHA * (jnp.exp(jnp.minimum(x, 0)) - 1)
  return jnp.where(x > 0, x, negative_side)

@jit
def d_elu(x):
  """Derivative of ELU: 1.0 where x > 0, elu(x) + ALPHA (= ALPHA * e^x) where x <= 0."""
  # jnp.where selects per element, so a non-finite value on the unselected
  # branch cannot leak into the result the way it did with `mask * value`.
  return jnp.where(x > 0, 1.0, elu(x) + ALPHA)

# Leaky ReLU which at ALPHA = 1 becomes the identity function
@jit
def lrelu(x):
  """Leaky ReLU: x where x > 0, ALPHA * x where x <= 0."""
  # Select instead of summing masked terms: `(x <= 0) * x` is False * inf = NaN
  # for an infinite positive input (and symmetrically for -inf).
  return jnp.where(x > 0, x, x * ALPHA)

@jit
def d_lrelu(x):
  """Derivative of Leaky ReLU: 1.0 where x > 0, ALPHA where x <= 0."""
  positive_part = (x > 0) * 1.0
  non_positive_part = (x <= 0) * ALPHA
  return positive_part + non_positive_part

# hyperbolic tan (input clamped to the range [-10, 10] so the exponentials stay finite)
@jit
def tanh(x):
  """Hyperbolic tangent computed from exponentials on an input clamped to [-10, 10]."""
  # jnp.clip instead of boolean-mask arithmetic: the mask version multiplied
  # False by the raw input, and False * inf is NaN, so infinite inputs became
  # NaN instead of being clamped.
  x = jnp.clip(x, -10, 10)
  e_pos = jnp.exp(x)
  e_neg = jnp.exp(-x)
  return (e_pos - e_neg) / (e_pos + e_neg)

@jit
def d_tanh(x):
  """Derivative of tanh: 1 - tanh(x)^2."""
  squashed = tanh(x)
  return 1 - squashed**2

# Generic choice function
@jit
def activation(x, func):
  """Apply the activation selected by the integer `func` (SIGMOID ... TANH) to `x`."""
  # Branch order must match the numeric values of the selector constants.
  branches = [sigmoid, relu, elu, lrelu, tanh]
  return switch(func, branches, x)
@jit
def d_activation(x, func):
  """Apply the derivative of the activation selected by the integer `func` to `x`."""
  # Same ordering as the branch list in `activation`.
  derivative_branches = [d_sigmoid, d_relu, d_elu, d_lrelu, d_tanh]
  return switch(func, derivative_branches, x)
        

In [None]:
# cost functions and cost derivatives
# Selector values: each one is the index of the matching branch in the lists
# handed to `switch` by `error` / `d_error`.
MEAN_SQUARED, MEAN_ABSOLUTE, MEAN_PERCENT = range(3)
# MEAN_SQ_LOG = 3

# mean squared
@jit
def mean_squared(actual_values, expected_values):
  """Elementwise squared error (actual - expected)^2; no reduction is applied here."""
  residual = actual_values - expected_values
  return residual**2

@jit
def d_mean_squared(actual_values, expected_values):
  """Derivative of the squared error w.r.t. the actual values: 2 * (actual - expected)."""
  residual = actual_values - expected_values
  return 2 * residual

# mean absolute
@jit
def mean_absolute(actual_values, expected_values):
  """Elementwise absolute error |actual - expected|; no reduction is applied here."""
  residual = actual_values - expected_values
  return jnp.absolute(residual)

@jit
def d_mean_absolute(actual_values, expected_values):
  """Derivative of the absolute error: +1 where actual > expected, -1 where below, 0 when equal."""
  residual = actual_values - expected_values
  above = (residual > 0) * 1.0
  below = (residual < 0) * -1.0
  return above + below

# mean percent
@jit
def mean_percent(actual_values, expected_values):
  """Elementwise absolute percentage error: 100 * |actual - expected| / |expected|.

  The denominator is the magnitude of the expected value so the error is never
  negative. An expected value of 0 divides by zero and yields inf/nan.
  """
  return (mean_absolute(actual_values, expected_values) / jnp.abs(expected_values)) * 100

@jit
def d_mean_percent(actual_values, expected_values):
  """Derivative of `mean_percent` w.r.t. the actual values: 100 * sign(actual - expected) / |expected|."""
  # The 1 / |expected| factor was previously missing, so the gradient did not
  # match the error function it is paired with in `d_error`.
  return 100 * d_mean_absolute(actual_values, expected_values) / jnp.abs(expected_values)

# @ jit
# def mean_sq_log(actual_values, expected_values):
#   return mean_squared(jnp.log(actual_values), jnp.log(expected_values))

# @ jit
# def d_mean_sq_log(actual_values, expected_values):
#   return (2 / actual_values) * mean_squared(jnp.log(actual_values), jnp.log(expected_values))

# Generic choice function
@jit
def error(actual_values, expected_values, func):
  """Evaluate the cost selected by the integer `func` (MEAN_SQUARED, MEAN_ABSOLUTE or MEAN_PERCENT)."""
  # Branch order must match the numeric values of the selector constants.
  cost_branches = [mean_squared, mean_absolute, mean_percent]
  return switch(func, cost_branches, actual_values, expected_values)
@jit
def d_error(actual_values, expected_values, func):
  """Evaluate the derivative of the cost selected by the integer `func`."""
  # Same ordering as the branch list in `error`.
  derivative_branches = [d_mean_squared, d_mean_absolute, d_mean_percent]
  return switch(func, derivative_branches, actual_values, expected_values)

In [None]:
# layer creation functions
# One dense layer: weight matrix, bias vector, and the integer selectors of its
# activation and error functions.
Layer = namedtuple('Layer', ['weights', 'biases', 'activation', 'error'])

# class Layer(NamedTuple):
#   weights: jnp.array
#   biases: jnp.array
#   activation: int
#   error: int

# make layer
# should return -> tuple containing an array(nodes in, nodes out) of weights and an array(nodes out) of biases
@partial(jit, static_argnums=(1,))
def make_layer(key, shape, activation, error):
  """Create one layer with uniformly random weights in [-10, 10) and zero biases.

  `shape` is (nodes in, nodes out). Returns the advanced PRNG key together with
  the new Layer so callers can keep splitting from it.
  """
  nodes_in, nodes_out = shape[0], shape[1]
  key, subkey = random.split(key)
  weights = random.uniform(key = subkey, minval = -10., maxval = 10., shape = (nodes_in, nodes_out))
  biases = jnp.zeros(shape = nodes_out)
  return key, Layer(weights, biases, activation, error)

In [None]:
# layer computation functions

# def __call__(self, inputs):
#         self._inputs = inputs
#         for out_node in range(self._nodes_out):
#             self._z_vals[out_node] = self._biases[out_node] + np.dot(inputs, self._weights[..., out_node])
#             self._activation_vals[out_node] = self._activation.f(self._z_vals[out_node])
#         return self._activation_vals
@jit
def calc_layer_output(layer, inputs):
  """Forward pass through one layer; returns (pre-activation values, activated outputs)."""
  weighted_inputs = jnp.matmul(inputs, layer.weights)
  z_vals = layer.biases + weighted_inputs
  return z_vals, activation(z_vals, layer.activation)

# def _calc_output_node_vals(self, expected_output):
#         node_values = np.zeros(shape = self._nodes_out)
#         for out_node in range(self._nodes_out):
#             cost_derivative = self._error.d_f(self._activation_vals[out_node], expected_output[out_node])
#             activation_val_derivative = self._activation.d_f(self._z_vals[out_node])
#             node_values[out_node] = cost_derivative * activation_val_derivative
#         return node_values
@jit
def calc_output_node_vals(layer, z_vals, actual_output, expected_output):
  """Node values of the output layer: activation derivative times cost derivative."""
  activation_slope = d_activation(z_vals, layer.activation)
  cost_slope = d_error(actual_output, expected_output, layer.error)
  return activation_slope * cost_slope

# def _calc_hidden_node_vals(self, old_layer, old_node_vals):
#         new_node_vals = np.zeros(shape = self._nodes_out)
#         for new_node in range(new_node_vals.shape[0]):
#             new_node_val = 0
#             for old_node in range(old_node_vals.shape[0]):
#                 weighted_derivative = old_layer._weights[new_node, old_node]
#                 new_node_val += weighted_derivative * old_node_vals[old_node]
#             new_node_val *= self._activation.d_f(self._z_vals[new_node])
#             new_node_vals[new_node] = new_node_val
#         return new_node_vals
@jit
def calc_hidden_node_vals(layer, z_vals, node_vals, weights):
  """Node values of a hidden layer.

  `node_vals` and `weights` belong to the following layer; the node values are
  pulled back through the transposed weights and scaled by this layer's
  activation derivative.
  """
  activation_slope = d_activation(z_vals, layer.activation)
  pulled_back = jnp.matmul(node_vals, weights.T)
  return activation_slope * pulled_back

# def _update_gradients(self, node_values):
#         for out_node in range(self._nodes_out):
#             for in_node in range(self._nodes_in):
#                 cost_weight_derivative = self._inputs[in_node] * node_values[out_node]
#                 self._gradient_w[in_node, out_node] += cost_weight_derivative
#             self._gradient_b[out_node] += node_values[out_node]
@jit
def update_gradient(gradient_w, gradient_b, inputs, node_values):
  """Accumulate one sample's contribution into the weight and bias gradients."""
  # (nodes in, 1) x (1, nodes out): every input paired with every node value.
  weight_contribution = jnp.matmul(inputs[:, None], node_values[None, :])
  return gradient_w + weight_contribution, gradient_b + node_values

# def _apply_gradients(self, learning_rate):
#     self._biases -= (learning_rate * self._gradient_b)
#     self._weights -= (learning_rate * self._gradient_w)
@jit
def apply_gradients(layer, gradient_w, gradient_b, learning_rate):
  """Return a new Layer after one gradient-descent step; selectors are carried over unchanged."""
  stepped_weights = layer.weights - (learning_rate * gradient_w)
  stepped_biases = layer.biases - (learning_rate * gradient_b)
  return Layer(stepped_weights, stepped_biases, layer.activation, layer.error)

# def _clear_gradients(self):
#     self._gradient_w = np.zeros(shape = (self._nodes_in, self._nodes_out))
#     self._gradient_b = np.zeros(shape = self._nodes_out)
@partial(jit, static_argnums=(0,))
def reset_gradients(shape):
  """Return zeroed (weight gradient, bias gradient) for a layer of the given (nodes in, nodes out) shape."""
  nodes_in, nodes_out = shape[0], shape[1]
  zero_w = jnp.zeros(shape = (nodes_in, nodes_out))
  zero_b = jnp.zeros(shape = nodes_out)
  return zero_w, zero_b

In [None]:
# neural network creation functions
@partial(jit, static_argnums=(1,))
def make_network(key, shape: tuple, activations: tuple, error: tuple):
  """Build a list of layers from consecutive sizes in `shape`.

  Layer i maps shape[i] -> shape[i + 1] nodes and uses activations[i] / error[i].
  Returns the advanced PRNG key and the list of layers.
  """
  layers = []
  for i, (nodes_in, nodes_out) in enumerate(zip(shape[:-1], shape[1:])):
    key, layer = make_layer(key, shape = (nodes_in, nodes_out), activation = activations[i], error = error[i])
    layers.append(layer)
  return key, layers

# def make_layer_list(i, val):
#   key, shape, layers = val
#   print(shape)
#   key, layer = make_layer(key, shape = jnp.array([shape[i], shape[i + 1]]))
#   layers.append(layer)
#   return (key, shape, layers)
  
# # @partial(jit, static_argnums=(1,))
# def make_network(key, shape):
#   print(shape)
#   result = fori_loop(0, len(shape) - 1, make_layer_list, (key, shape, []))
#   return result[0], result[2]

In [None]:
# neural network computation functions

# def _calc_outputs(self, inputs):
#         for layer in self._layers:
#             inputs = layer(inputs)
#         self._outputs = inputs
def calc_network_output(network, inputs, learning = False):
  """Run `inputs` through every layer in order.

  Returns only the final outputs by default. With `learning` set, returns
  (per-layer inputs, per-layer pre-activation values, final outputs), which is
  what back-propagation needs.
  """
  input_list, z_vals_list = [], []
  outputs = inputs
  for layer in network:
    input_list.append(outputs)
    z_vals, outputs = calc_layer_output(layer, outputs)
    z_vals_list.append(z_vals)
  if not learning:
    return outputs
  return input_list, z_vals_list, outputs

@jit
def classify(network, inputs):
  """Return the index of the largest network output for `inputs`."""
  outputs = calc_network_output(network, inputs)
  return jnp.argmax(outputs)

# @ jit
# def apply_prop(i, val):
#   network, inputs, input_list, z_vals_list = val
#   input_list.append(inputs)
#   z_vals, inputs = calc_layer_output(network[i], inputs)
#   z_vals_list.append(z_vals)
#   return (network, inputs, input_list, z_vals_list)

# @partial(jit, static_argnums = (2,))
# def calc_network_output(network, inputs, learning = False):
#   result = fori_loop(0, len(network), apply_prop, (network, inputs, [], []))
#   if learning:
#     return result[2], result[3], result[1]
#   return result[1]

# def _cost(self, data_point):
#         self._calc_outputs(data_point['input'])
#         output_layer = self._layers[len(self._layers) - 1]
#         cost = 0.
#         for out_node in range(self._outputs.shape[0]):
#             cost += output_layer._error.f(self._outputs[out_node], data_point['expected_output'][out_node])
#         return cost
def cost(i, val):
  """fori_loop body: add the summed error of sample `i` to the running total.

  `val` is (total_cost, network, inputs, expected_outputs); the error function
  is the one selected by the last layer.
  """
  total_cost, network, inputs, expected_outputs = val
  outputs = calc_network_output(network, inputs[i])
  sample_cost = jnp.sum(error(outputs, expected_outputs[i], network[-1].error))
  return (total_cost + sample_cost, network, inputs, expected_outputs)

# def _avg_cost(self, data_points):
#     total_cost = 0.
#     for data_point in data_points:
#         total_cost += self._cost(data_point)
#     return total_cost / len(data_points)
def avg_cost(network, inputs, expected_outputs):
  """Average per-sample cost of `network` over all `inputs`."""
  sample_count = len(inputs)
  final_state = fori_loop(0, sample_count, cost, (0, network, inputs, expected_outputs))
  return final_state[0] / sample_count

def test(i, val):
  """fori_loop body: count sample `i` as correct when the argmax of the network output matches the target's."""
  total_correct, network, inputs, expected_outputs = val
  predicted = jnp.argmax(calc_network_output(network, inputs[i]))
  target = jnp.argmax(expected_outputs[i])
  return (total_correct + ((target == predicted) * 1), network, inputs, expected_outputs)

def accuracy(network, inputs, expected_outputs):
  """Fraction of `inputs` whose predicted class (argmax) matches the expected one."""
  sample_count = len(inputs)
  final_state = fori_loop(0, sample_count, test, (0, network, inputs, expected_outputs))
  return final_state[0] / sample_count

# def _apply_gradients(self, learning_rate):
#       for layer in self._layers:
#           layer._apply_gradients(learning_rate)
def apply_all_gradients(network, gradient_w_list, gradient_b_list, learning_rate):
  """Apply one gradient-descent step to every layer and return the updated network.

  A new list is returned and the `network` passed in is left untouched, so a
  caller that keeps a reference to its initial network does not see it change.
  """
  return [
    apply_gradients(network[layer], gradient_w_list[layer], gradient_b_list[layer], learning_rate)
    for layer in range(len(network))
  ]

# @ jit
# def apply_per_layer(i, val):
#   network, gradient_w_list, gradient_b_list, learning_rate = val
#   network[i] = apply_gradients(network[i], gradient_w_list[i], gradient_b_list[i], learning_rate)
#   return (network, gradient_w_list, gradient_b_list, learning_rate)

# @ jit
# def apply_all_gradients(network, gradient_w_list, gradient_b_list, learning_rate):
#   result = fori_loop(0, len(network), apply_per_layer, (network, gradient_w_list, gradient_b_list, learning_rate))
#   return result[0]

# def _clear_gradients(self) :
#     for layer in self._layers:
#         layer._clear_gradients()
def reset_gradient_list(network):
  """Return (weight gradients, bias gradients): one zeroed array per layer, shaped like that layer's weights/biases."""
  zeroed = [reset_gradients(layer.weights.shape) for layer in network]
  gradient_w_list = [gradient_w for gradient_w, _ in zeroed]
  gradient_b_list = [gradient_b for _, gradient_b in zeroed]
  return gradient_w_list, gradient_b_list

# @ jit
# def create_gradient_list(i, val):
#   network, gradient_w_list, gradient_b_list = val
#   gradient_w, gradient_b = reset_gradients(network[i])
#   gradient_w_list.append(gradient_w)
#   gradient_b_list.append(gradient_b)
#   return (network, gradient_w_list, gradient_b_list)

# @ jit
# def reset_gradient_list(network):
#   result = fori_loop(0, len(network), create_gradient_list, (network, [], []))
#   return result[1], result[2] 

# def _back_prop(self, data_point):
#     self._calc_outputs(data_point['input'])
#     output_layer = self._layers[len(self._layers) - 1]
#     node_values = output_layer._calc_output_node_vals(data_point['expected_output'])
#     output_layer._update_gradients(node_values)
#     for layer in range(len(self._layers) - 2, -1, -1):
#         hidden_layer = self._layers[layer]
#         node_values = hidden_layer._calc_hidden_node_vals(self._layers[layer + 1], node_values)
#         hidden_layer._update_gradients(node_values)
def back_prop(network, gradient_w_list, gradient_b_list, inputs, expected_outputs):
  """Accumulate the gradients of one sample into the given gradient lists.

  The lists are updated in place and also returned.
  """
  input_list, z_vals_list, outputs = calc_network_output(network, inputs, learning = True)
  last = len(network) - 1

  # output layer: node values come straight from the cost derivative
  node_values = calc_output_node_vals(network[last], z_vals_list[last], outputs, expected_outputs)
  gradient_w_list[last], gradient_b_list[last] = update_gradient(gradient_w_list[last], gradient_b_list[last], input_list[last], node_values)

  # hidden layers, walking from the last hidden layer back to the first
  for layer in reversed(range(last)):
    node_values = calc_hidden_node_vals(network[layer], z_vals_list[layer], node_values, network[layer + 1].weights)
    gradient_w_list[layer], gradient_b_list[layer] = update_gradient(gradient_w_list[layer], gradient_b_list[layer], input_list[layer], node_values)
  return gradient_w_list, gradient_b_list

# def update_all_gradients(i, val):
#   input_list, z_vals_list, outputs, network, gradient_w_list, gradient_b_list = val
#   node_values = cond(i == 1, calc_output_node_vals, calc_hidden_node_vals, network[len(network) - i], z_vals_list[len(network) - i])

# @ jit
# def back_prop(network, gradient_w_list, gradient_b_list, inputs, expected_outputs):
#   input_list, z_vals_list, outputs = calc_network_output(network, inputs, learning = True)
#   _, _, _, _, gradient_w_list, gradient_b_list = fori_loop(1, len(network) + 1, update_all_gradients, (input_list, z_vals_list, outputs, network, gradient_w_list, gradient_b_list))
#   return gradient_w_list, gradient_b_list

# def learn(self, batch, learning_rate):
#         avg_vals = []
#         for epoch in range(self._EPOCH):
#             for pos in range(0, len(batch), self._BATCH_SIZE):
#                 mini_batch = batch[pos : np.minimum(pos + self._BATCH_SIZE, len(batch))]
#                 for data_point in mini_batch:
#                     self._back_prop(data_point)
#                 self._apply_gradients(learning_rate / np.minimum(self._BATCH_SIZE, len(mini_batch)))
#                 self._clear_gradients()
#                 # avg_vals.append(self._avg_cost(mini_batch))
#             # self._apply_gradients(learning_rate / len(batch))
#             # self._clear_gradients()
#             avg_vals.append(self._avg_cost(batch))
#         return avg_vals
def apply_back_prop(i, val):
  """fori_loop body: back-propagate sample `i` of the mini batch into the carried gradient lists."""
  network, mini_batch, expected_outputs, gradient_w_list, gradient_b_list = val
  updated_w, updated_b = back_prop(network, gradient_w_list, gradient_b_list, mini_batch[i], expected_outputs[i])
  return (network, mini_batch, expected_outputs, updated_w, updated_b)

def learn(network, mini_batch, mini_expected, learning_rate):
  """Run one gradient-descent step on a mini batch and return the updated network.

  Gradients are summed over the samples, so the step size is the learning rate
  divided by the mini-batch length.
  """
  sample_count = len(mini_batch)
  gradient_w_list, gradient_b_list = reset_gradient_list(network)
  loop_state = (network, mini_batch, mini_expected, gradient_w_list, gradient_b_list)
  network, _, _, gradient_w_list, gradient_b_list = fori_loop(0, sample_count, apply_back_prop, loop_state)
  return apply_all_gradients(network, gradient_w_list, gradient_b_list, learning_rate / sample_count)

def train(network, batch, expected_outputs, # training information
          learning_rate = .001, epochs = 10, batch_size = 1000, # meta parameters
          train_percent = 1, set_aside_training_data = True, test_batch = None, test_outputs = None): # testing information
  """Train `network` with mini-batch gradient descent.

  Parameters:
    network: list of Layer objects to start from.
    batch, expected_outputs: samples and their targets, indexed along the first axis.
    learning_rate, epochs, batch_size: optimisation settings.
    train_percent: fraction of `batch` used for training when
      `set_aside_training_data` is true; the remainder becomes the test split
      (with the default of 1 that split is empty).
    set_aside_training_data: when false, `test_batch` / `test_outputs` must be
      given and all of `batch` is used for training.

  Returns (accuracy per epoch, loss per mini batch, trained network). If no
  test data is available the function prints a message and returns two empty
  lists and the untouched network.
  """
  all_loss = []
  all_acc = []

  if set_aside_training_data:
    batch_cut = int(jnp.floor(len(batch) * train_percent))
    outputs_cut = int(jnp.floor(len(expected_outputs) * train_percent))
    train_batch = batch[0 : batch_cut]
    train_outputs = expected_outputs[0 : outputs_cut]
    test_batch = batch[batch_cut : len(batch)]
    test_outputs = expected_outputs[outputs_cut : len(expected_outputs)]
  # `is None`, not `== None`: the arguments are arrays, and an equality test on
  # an array is not a reliable way to detect a missing argument.
  elif test_batch is None or test_outputs is None:
    print("No testing data, but no training data was set aside either (if you want no data to be set aside, use train_percent = 1 instead).")
    # same (accuracy, loss, network) order as the normal return below
    return all_acc, all_loss, network
  else:
    train_batch = batch
    train_outputs = expected_outputs


  for epoch in range(epochs):
    for pos in range(0, len(train_batch), batch_size):
      # slicing already clamps the upper bound to the end of the data
      mini_batch = train_batch[pos : pos + batch_size]
      mini_outputs = train_outputs[pos : pos + batch_size]

      network = learn(network, mini_batch, mini_outputs, learning_rate)

      all_loss.append(avg_cost(network, mini_batch, mini_outputs))
      # all_acc.append(test(network, mini_batch, mini_expected))
    # all_avg.append(avg_cost(network, batch, expected_outputs))
    all_acc.append(accuracy(network, test_batch, test_outputs))
  return all_acc, all_loss, network


In [None]:
def deconstruct(network):
  """Collect each field across all layers and return four stacked arrays.

  Returns (weights, biases, activations, errors), each with one entry per layer.
  NOTE(review): stacking with jnp.array presumably requires every layer to have
  the same weight/bias shape - confirm before using on mixed-size networks.
  """
  weights = [layer.weights for layer in network]
  biases = [layer.biases for layer in network]
  activations = [layer.activation for layer in network]
  errors = [layer.error for layer in network]

  return jnp.array(weights), jnp.array(biases), jnp.array(activations), jnp.array(errors)

def replicate(network, count):
  """Return a copy of `network` in which every leaf has a new leading axis of length `count`.

  The container structure (the list of layers and each layer's fields) is
  preserved, so the result can be mapped over its leading axis by pmap / vmap
  and still be iterated layer by layer inside the mapped function. Working leaf
  by leaf also means layers of different shapes are supported, which stacking
  whole fields across layers is not.
  """
  def add_leading_axis(leaf):
    leaf = jnp.asarray(leaf)
    return jnp.broadcast_to(leaf, (count,) + leaf.shape)

  return tree_map(add_leading_axis, network)

def split(arr, count):
  """Reshape `arr` into `count` equally sized parts along a new leading axis.

  The result has shape (count, len(arr) // count, *arr.shape[1:]). Trailing
  rows that do not fill a complete part are dropped, so a length that is not a
  multiple of `count` no longer makes the reshape fail.
  """
  per_part = arr.shape[0] // count
  return arr[: per_part * count].reshape(count, per_part, *arr.shape[1:])

In [None]:
# parallelized training functions
# learning_rate is a scalar shared by every device, so it is broadcast
# (in_axes None) instead of being mapped over a leading device axis.
@partial(pmap, axis_name="mini_batch", in_axes=(0, 0, 0, None))
def plearn(network, mini_batch, mini_expected, learning_rate):
  """Per-device training step: accumulate gradients on this device's shard,
  average them across devices, then apply the step to this device's replica."""
  gradient_w_list, gradient_b_list = reset_gradient_list(network)
  network, mini_batch, mini_expected, gradient_w_list, gradient_b_list = fori_loop(0, len(mini_batch), apply_back_prop, (network, mini_batch, mini_expected, gradient_w_list, gradient_b_list))

  gradient_w_list = pmean(gradient_w_list, axis_name="mini_batch")
  gradient_b_list = pmean(gradient_b_list, axis_name="mini_batch")

  return apply_all_gradients(network, gradient_w_list, gradient_b_list, learning_rate / len(mini_batch))

def ptrain(network, batch, expected_outputs, # training information
          learning_rate = .001, epochs = 10, batch_size = 1000, # meta parameters
          train_percent = 1, set_aside_training_data = True, test_batch = None, test_outputs = None, # testing information
          device_count = 1): # device information
  """Data-parallel version of `train`: each step feeds `batch_size` samples to each of `device_count` devices.

  Returns (accuracy per epoch, loss per epoch, trained network). If no test
  data is available the function prints a message and returns two empty lists
  and the untouched network.
  """
  all_loss = []
  all_acc = []

  if set_aside_training_data:
    batch_cut = int(jnp.floor(len(batch) * train_percent))
    outputs_cut = int(jnp.floor(len(expected_outputs) * train_percent))
    train_batch = batch[0 : batch_cut]
    train_outputs = expected_outputs[0 : outputs_cut]
    test_batch = batch[batch_cut : len(batch)]
    test_outputs = expected_outputs[outputs_cut : len(expected_outputs)]
  elif test_batch is None or test_outputs is None:
    print("No testing data, but no training data was set aside either (if you want no data to be set aside, use train_percent = 1 instead).")
    # same (accuracy, loss, network) order as the normal return below
    return all_acc, all_loss, network
  else:
    train_batch = batch
    train_outputs = expected_outputs

  # Give every leaf a leading device axis while keeping the list-of-layers
  # structure that plearn iterates over.
  replicas = tree_map(lambda leaf: jnp.stack([jnp.asarray(leaf)] * device_count), network)
  trained = network
  step = batch_size * device_count

  for epoch in range(epochs):
    for pos in range(0, len(train_batch), step):
      # Use only a multiple of device_count samples so the chunk divides evenly
      # between devices; a final chunk smaller than device_count is skipped.
      usable = (min(step, len(train_batch) - pos) // device_count) * device_count
      if usable == 0:
        continue
      mini_batch_split = split(train_batch[pos : pos + usable], device_count)
      mini_outputs_split = split(train_outputs[pos : pos + usable], device_count)

      replicas = plearn(replicas, mini_batch_split, mini_outputs_split, learning_rate)

    # Gradients are averaged across devices before being applied, so every
    # replica holds the same weights; evaluate (and return) the first one.
    trained = tree_map(lambda leaf: leaf[0], replicas)
    all_loss.append(avg_cost(trained, batch, expected_outputs))
    all_acc.append(accuracy(trained, test_batch, test_outputs))
  return all_acc, all_loss, trained

In [None]:
# vectorized training functions
# learning_rate is a scalar shared by every replica, so it is broadcast
# (in_axes None) instead of being mapped over a leading axis.
@partial(vmap, axis_name="mini_batch", in_axes=(0, 0, 0, None))
def vlearn(network, mini_batch, mini_expected, learning_rate):
  """Per-replica training step: accumulate gradients on this replica's shard,
  average them across the mapped axis, then apply the step to this replica."""
  gradient_w_list, gradient_b_list = reset_gradient_list(network)
  network, mini_batch, mini_expected, gradient_w_list, gradient_b_list = fori_loop(0, len(mini_batch), apply_back_prop, (network, mini_batch, mini_expected, gradient_w_list, gradient_b_list))

  gradient_w_list = pmean(gradient_w_list, axis_name="mini_batch")
  gradient_b_list = pmean(gradient_b_list, axis_name="mini_batch")

  return apply_all_gradients(network, gradient_w_list, gradient_b_list, learning_rate / len(mini_batch))

def vtrain(network, batch, expected_outputs, # training information
          learning_rate = .001, epochs = 10, batch_size = 1000, # meta parameters
          train_percent = 1, set_aside_training_data = True, test_batch = None, test_outputs = None, # testing information
          process_count = 8): # device information
  """Vectorised version of `train`: each step feeds `batch_size` samples to each of `process_count` vmapped replicas.

  Returns (accuracy per epoch, loss per epoch, trained network). If no test
  data is available the function prints a message and returns two empty lists
  and the untouched network.
  """
  all_loss = []
  all_acc = []

  if set_aside_training_data:
    batch_cut = int(jnp.floor(len(batch) * train_percent))
    outputs_cut = int(jnp.floor(len(expected_outputs) * train_percent))
    train_batch = batch[0 : batch_cut]
    train_outputs = expected_outputs[0 : outputs_cut]
    test_batch = batch[batch_cut : len(batch)]
    test_outputs = expected_outputs[outputs_cut : len(expected_outputs)]
  elif test_batch is None or test_outputs is None:
    print("No testing data, but no training data was set aside either (if you want no data to be set aside, use train_percent = 1 instead).")
    # same (accuracy, loss, network) order as the normal return below
    return all_acc, all_loss, network
  else:
    train_batch = batch
    train_outputs = expected_outputs

  # Give every leaf a leading replica axis while keeping the list-of-layers
  # structure that vlearn iterates over.
  replicas = tree_map(lambda leaf: jnp.stack([jnp.asarray(leaf)] * process_count), network)
  trained = network
  step = batch_size * process_count

  for epoch in range(epochs):
    for pos in range(0, len(train_batch), step):
      # Use only a multiple of process_count samples so the chunk divides
      # evenly; a final chunk smaller than process_count is skipped.
      usable = (min(step, len(train_batch) - pos) // process_count) * process_count
      if usable == 0:
        continue
      mini_batch_split = split(train_batch[pos : pos + usable], process_count)
      mini_outputs_split = split(train_outputs[pos : pos + usable], process_count)

      replicas = vlearn(replicas, mini_batch_split, mini_outputs_split, learning_rate)

    # Gradients are averaged across replicas before being applied, so every
    # replica holds the same weights; evaluate (and return) the first one.
    trained = tree_map(lambda leaf: leaf[0], replicas)
    all_loss.append(avg_cost(trained, train_batch, train_outputs))
    all_acc.append(accuracy(trained, test_batch, test_outputs))
  return all_acc, all_loss, trained

In [None]:
def preprocess_google_mnist(file):
  """Load an MNIST CSV whose rows are `label,pixel,pixel,...`.

  Returns (labels, pixels): labels as one-hot float rows of width 10 and pixels
  scaled from 0..255 to 0..1. Rows are collected in plain Python lists and
  converted once at the end, instead of creating a JAX array per row and per
  label.
  """
  labels = []
  pixel_rows = []

  with open(file, newline = '') as csv_file:
    reader = csv.reader(csv_file, delimiter = ',', quotechar = '"')

    for row in reader:
      if not row:  # csv.reader yields an empty list for a blank line
        continue
      labels.append(int(row[0]))
      pixel_rows.append([int(x) for x in row[1 :]])

  label_index = np.asarray(labels, dtype = np.int64)
  one_hot = np.zeros((len(labels), 10), dtype = np.float32)
  one_hot[np.arange(len(labels)), label_index] = 1

  pixels = np.asarray(pixel_rows, dtype = np.float32) / 255

  return jnp.asarray(one_hot), jnp.asarray(pixels)

In [None]:
# preprocess_google_mnist returns (one-hot labels, scaled pixels), in that order.
# Paths are relative to the working directory.
training_output, training_data = preprocess_google_mnist("./sample_data/mnist_train_small.csv")
testing_output, testing_data = preprocess_google_mnist("./sample_data/mnist_test.csv")

In [None]:
# 784 -> 100 -> 10 network: ReLU on the first layer, sigmoid on the output layer,
# squared error selected for both layers. `key` is rebound to the advanced PRNG key.
key, net = make_network(key, shape = (784, 100, 10), activations = (RELU, SIGMOID), error = (MEAN_SQUARED, MEAN_SQUARED))

In [None]:
# Show the devices JAX can see (displayed as the cell's output).
devices()

[StreamExecutorGpuDevice(id=0, process_index=0)]

In [None]:
numbert = net
# print(deconstruct(numbert))

# Experiment: learning rate 0.3, mini batches of 100, evaluated on the separate test set.
accuracy_list, loss_list, numbert = train(
  numbert, training_data, training_output,
  learning_rate = .3, epochs = 3000, batch_size = 100,
  set_aside_training_data = False, test_batch = testing_data, test_outputs = testing_output)

fig, (accuracy_axes, loss_axes) = plt.subplots(2, 1, figsize=(9, 9), dpi=80)

accuracy_axes.plot(accuracy_list)
accuracy_axes.set_title("Numbert's Accuracy")
accuracy_axes.set_ylim(0, 1)

loss_axes.plot(loss_list)
loss_axes.set_title("Numbert's Loss")
plt.show()

In [None]:
numbert = net
# print(deconstruct(numbert))

# Experiment: learning rate 0.05, mini batches of 100, evaluated on the separate test set.
accuracy_list, loss_list, numbert = train(
  numbert, training_data, training_output,
  learning_rate = .05, epochs = 3000, batch_size = 100,
  set_aside_training_data = False, test_batch = testing_data, test_outputs = testing_output)

fig, (accuracy_axes, loss_axes) = plt.subplots(2, 1, figsize=(9, 9), dpi=80)

accuracy_axes.plot(accuracy_list)
accuracy_axes.set_title("Numbert's Accuracy")
accuracy_axes.set_ylim(0, 1)

loss_axes.plot(loss_list)
loss_axes.set_title("Numbert's Loss")
plt.show()


In [None]:
numbert = net
# print(deconstruct(numbert))

# Experiment: learning rate 0.3, mini batches of 50, evaluated on the separate test set.
accuracy_list, loss_list, numbert = train(
  numbert, training_data, training_output,
  learning_rate = .3, epochs = 3000, batch_size = 50,
  set_aside_training_data = False, test_batch = testing_data, test_outputs = testing_output)

fig, (accuracy_axes, loss_axes) = plt.subplots(2, 1, figsize=(9, 9), dpi=80)

accuracy_axes.plot(accuracy_list)
accuracy_axes.set_title("Numbert's Accuracy")
accuracy_axes.set_ylim(0, 1)

loss_axes.plot(loss_list)
loss_axes.set_title("Numbert's Loss")
plt.show()

In [None]:
numbert = net
# print(deconstruct(numbert))

# Experiment: learning rate 0.05, mini batches of 50, evaluated on the separate test set.
accuracy_list, loss_list, numbert = train(
  numbert, training_data, training_output,
  learning_rate = .05, epochs = 3000, batch_size = 50,
  set_aside_training_data = False, test_batch = testing_data, test_outputs = testing_output)

fig, (accuracy_axes, loss_axes) = plt.subplots(2, 1, figsize=(9, 9), dpi=80)

accuracy_axes.plot(accuracy_list)
accuracy_axes.set_title("Numbert's Accuracy")
accuracy_axes.set_ylim(0, 1)

loss_axes.plot(loss_list)
loss_axes.set_title("Numbert's Loss")
plt.show()