In [29]:
import math

def sigmoid_activation(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of a scalar.

    The formula is evaluated in whichever of two algebraically equal forms
    keeps the argument of ``math.exp`` non-positive, so very negative inputs
    return a value near 0 instead of raising ``OverflowError``.

    Args:
        x: A real number.

    Returns:
        A float in the closed interval [0, 1].
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For x < 0, exp(-x) can overflow; e**x / (1 + e**x) is the same value
    # and only ever exponentiates a negative number.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)


def softmax_activation(x):
    """Return the softmax of a sequence of numbers as a list.

    The largest input is subtracted from every element before
    exponentiating. Softmax is invariant to that shift, and it keeps every
    ``math.exp`` argument at or below zero so large inputs do not raise
    ``OverflowError``.

    Args:
        x: An iterable of real numbers.

    Returns:
        A list of floats with the same length as ``x`` that sums to 1
        (up to rounding), or an empty list when ``x`` is empty.
    """
    values = list(x)
    if not values:
        return []
    peak = max(values)
    exp_x = [math.exp(v - peak) for v in values]
    sum_exp_x = sum(exp_x)
    return [e / sum_exp_x for e in exp_x]


def loss_(predictions: list, targets: list, clipping=1e-15):
    """Return the cross-entropy ``-sum(log(p_i + clipping) * t_i)``.

    Args:
        predictions: Predicted values, indexable by the positions of
            ``targets``.
        targets: Target values; one term is summed per element.
        clipping: Small constant added to each prediction before the
            logarithm so that a prediction of exactly 0 does not raise.

    Returns:
        The summed loss as a number.
    """
    # Read the ``targets`` parameter, not a module-level variable, so the
    # result depends only on the arguments.
    return -sum(
        math.log(predictions[i] + clipping) * t for i, t in enumerate(targets)
    )


def sigmoid_derivative(x):
    """Return the derivative of the sigmoid at ``x``.

    Computed from the activation itself as ``s * (1 - s)`` where
    ``s = sigmoid_activation(x)``.
    """
    activation = sigmoid_activation(x)
    complement = 1 - activation
    return activation * complement


def softmax_derivative(x, index):
    """Return ``s * (1 - s)`` for the softmax output at position ``index``.

    This is the derivative of output ``index`` with respect to its own
    input only; cross terms between different positions are not computed.

    Args:
        x: Sequence of inputs passed to ``softmax_activation``.
        index: Position of the output to differentiate.
    """
    prob = softmax_activation(x)[index]
    return prob * (1 - prob)


def forward_pass(inputs, weights_l1, bias_in, weights_l2, bias_out):
    """Evaluate the two-layer network and return every intermediate value.

    Args:
        inputs: Input values.
        weights_l1: First-layer weights, indexed ``weights_l1[input][hidden]``.
        bias_in: First-layer biases, one per hidden unit.
        weights_l2: Second-layer weights, indexed ``weights_l2[hidden][output]``.
        bias_out: Second-layer biases, one per output.

    Returns:
        A tuple ``(nodes_l1, l1_sigmoid, nodes_l2, predictions)``: the hidden
        pre-activations, their sigmoid activations, the output
        pre-activations, and the softmax of the latter.
    """

    def layer(values, weights, biases):
        # Weighted sum of ``values`` into each unit, plus that unit's bias.
        return [
            sum(weights[j][i] * value for j, value in enumerate(values)) + bias
            for i, bias in enumerate(biases)
        ]

    # Layer 1
    nodes_l1 = layer(inputs, weights_l1, bias_in)
    l1_sigmoid = list(map(sigmoid_activation, nodes_l1))

    # Layer 2
    nodes_l2 = layer(l1_sigmoid, weights_l2, bias_out)
    predictions = softmax_activation(nodes_l2)

    return nodes_l1, l1_sigmoid, nodes_l2, predictions


def backward_pass(
    inputs, target, nodes_l1, l1_sigmoid, nodes_l2, predictions, learning_rate=0.01
):
    """Apply one gradient-descent update and return parameters and gradients.

    The network parameters are NOT arguments. This function reads the
    module-level lists ``weights_l1``, ``bias_in``, ``weights_l2`` and
    ``bias_out`` and updates them in place, so they must be defined before
    it is called.

    Args:
        inputs: Input values that were fed to the first layer.
        target: Target values, one per output.
        nodes_l1: Not used by the computation; accepted so the tuple returned
            by ``forward_pass`` can be passed straight through.
        l1_sigmoid: Hidden-layer activations.
        nodes_l2: Not used by the computation (see ``nodes_l1``).
        predictions: Output-layer values.
        learning_rate: Step size applied to every gradient.

    Returns:
        A tuple ``(weights_l1, weights_l2, bias_out, bias_in,
        delta_weights_1, delta_weights_2)``: the updated module-level
        parameter lists followed by the gradients that were applied to the
        two weight matrices.
    """
    # Error at output. Iterating over ``target`` (not ``predictions``) avoids
    # an index error when predictions is the longer of the two.
    delta_out = [predictions[k] - t for k, t in enumerate(target)]

    # Error reaching each hidden activation, computed with the second-layer
    # weights as they are BEFORE this step's update.
    hidden_error = [
        sum(d * weights_l2[i][k] for k, d in enumerate(delta_out))
        for i in range(len(weights_l2))
    ]

    # Gradient for weights_l2[hidden][output].
    delta_weights_2 = [[d * a for d in delta_out] for a in l1_sigmoid]

    # Push the hidden error through the sigmoid: sigmoid'(z) = a * (1 - a).
    delta_hidden = [
        hidden_error[k] * (a * (1 - a)) for k, a in enumerate(l1_sigmoid)
    ]

    # Gradient for weights_l1[input][hidden].
    delta_weights_1 = [[x * d for d in delta_hidden] for x in inputs]

    # Update weights and biases
    # Layer 2 weights and biases
    for i, row in enumerate(weights_l2):
        for j in range(len(row)):
            row[j] -= learning_rate * delta_weights_2[i][j]

    for i in range(len(bias_out)):
        bias_out[i] -= learning_rate * delta_out[i]

    # Layer 1 weights and biases
    for i, row in enumerate(weights_l1):
        for j in range(len(row)):
            row[j] -= learning_rate * delta_weights_1[i][j]

    for i in range(len(bias_in)):
        bias_in[i] -= learning_rate * delta_hidden[i]

    return weights_l1, weights_l2, bias_out, bias_in, delta_weights_1, delta_weights_2


# Running the training step
# Example network: 2 inputs, 3 hidden units, 2 outputs.
# NOTE: backward_pass reads weights_l1, bias_in, weights_l2 and bias_out as
# module-level names, so these variables must keep exactly these names.
inputs = [1, -1]
weights_l1 = [[1, 1, 1], [-1, -1, -1]]  # indexed weights_l1[input][hidden]
bias_in = [0, 0, 0]  # one bias per hidden unit
weights_l2 = [[1, 1], [-1, -1], [-1, -1]]  # indexed weights_l2[hidden][output]
bias_out = [0, 0]  # one bias per output
target = [1, 0]

# Forward pass: keep every intermediate value so it can be handed to backward_pass.
nodes_l1, l1_sigmoid, nodes_l2, predictions = forward_pass(
    inputs, weights_l1, bias_in, weights_l2, bias_out
)
# Loss for this example; it is not used again in the visible code.
loss = loss_(predictions, target)
# One update step with the default learning rate; the parameter lists are modified in place.
weights_l1, weights_l2, bias_out, bias_in, delta_weights_1, delta_weights_2 = backward_pass(inputs, target, nodes_l1, l1_sigmoid, nodes_l2, predictions)
# Bare expression: as the last line of a notebook cell it displays both gradient matrices.
delta_weights_1, delta_weights_2

([[0.0, 0.0, 0.0], [-0.0, -0.0, -0.0]],
 [[-0.44039853898894116, 0.44039853898894116],
  [-0.44039853898894116, 0.44039853898894116],
  [-0.44039853898894116, 0.44039853898894116]])