In [2]:
import numpy as np

In [3]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def cross_entropy(y_pred, y_true):
    return -(y_true @ np.log(y_pred))

def d_cross_entropy(y, y_pred):
    return -(y - y_pred)

def softmax(L):
    arr = []
    for i in L:
        arr.append(np.exp(i) / np.exp(L).sum())

    return arr

def dsigmoid(x):
    return sigmoid(x) * sigmoid(1- x)

In [4]:
def initialize_parameters(inputs, hidden_layers, outputs):
    parameters = {}
    parameters["W1"] = np.random.rand(hidden_layers[0], inputs)
    parameters["b1"] = np.random.rand(hidden_layers[0])
    for i in range(1, len(hidden_layers)):
        parameters[f"W{i+1}"] = np.random.rand(hidden_layers[i], hidden_layers[i - 1])
        parameters[f"b{i+1}"] = np.random.rand(hidden_layers[i])
    parameters[f"W{len(hidden_layers) + 1}"] = np.random.rand(outputs, hidden_layers[-1])
    parameters[f"b{len(hidden_layers) + 1}"] = np.random.rand(outputs)
    return parameters

In [9]:
parameters = initialize_parameters(3, [3], 3)

In [None]:
W1 = np.array([
    [0.5488135, 0.71518937, 0.60276338],
    [0.54488318, 0.4236548, 0.64589411],
    [0.43758721, 0.891773, 0.96366276]
])
W2 = np.array([
    [0.56804456, 0.92559664, 0.07103606],
    [0.0871293, 0.0202184, 0.83261985],
    [0.77815675, 0.87001215, 0.97861834]
])
W3 = np.array([
    [0.11827443, 0.63992102, 0.14335329],
    [0.94466892, 0.52184832, 0.41466194],
    [0.26455561, 0.77423369, 0.45615033]
])

b1 = np.array(
    [0.38344152, 0.79172504, 0.52889492]
)

b2 = np.array(
    [0.7991586, 0.46147936, 0.52889492]
)

b3 = np.array(
    [0.56843395, 0.0187898, 0.6176355]
)

X = np.array(
    [[1, 0, 1]]
)

Y = np.array(
    [[0, 0, 1]]
)

In [None]:
parameters = {
    'W1': W1,
    'W2': W2,
    'W3': W3,
    'b1': b1,
    'b2': b2,
    'b3': b3,
}

In [7]:
def forward_propogation(parameters, x, g, O):
    activations = {}
    activations["a1"] = parameters["W1"] @ x + parameters["b1"]
    activations["h1"] = g(activations["a1"])
    for i in range(2, len(parameters) // 2):
        activations[f"a{i}"] = parameters[f"W{i}"] @ activations[f"h{i - 1}"] + parameters[f"b{i}"]
        activations[f"h{i}"] = g(activations[f"a{i}"])

    activations[f"a{len(parameters) // 2}"] = parameters[f"W{len(parameters) // 2}"] @ activations[f"h{len(parameters) // 2 - 1}"] + parameters[f"b{len(parameters) // 2}"]
    y_pred = O(activations[f"a{len(parameters) // 2}"])
    return y_pred, activations

In [11]:
X = np.array(
    [[1, 0, 1]]
)

Y = np.array(
    [[0, 0, 1]]
)

In [12]:
forward_propogation(parameters, X, sigmoid, softmax)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 1 is different from 3)

In [None]:
def backPropogation(y_pred, y, activations, g_dash, O_dash, parameters, x):
    losses = {}
    n = len(parameters) // 2
    m = len(activations) // 2
    La = O_dash(y, y_pred)

    Lh = La @ parameters[f"W{n}"]

    da = g_dash(activations[f"a{m}"])
    losses[f"LW{n}"] = np.outer(La, activations[f"h{m}"])
    losses[f"Lb{n}"] = La.copy()




    for i in range(1, m):
        La = Lh * da
        Lh = La @ parameters[f"W{m - i + 1}"]
        da = g_dash(activations[f"a{m - i}"])
        losses[f"LW{m - i + 1}"] = np.outer(La, activations[f"h{m - i}"])
        losses[f"Lb{m - i + 1}"] = La.copy()

    La = Lh * da
    losses["LW1"] = np.outer(La, x)
    losses["Lb1"] = La.copy()


    return losses

In [None]:
y_pred, activations = forward_propogation(parameters, x, sigmoid, softmax)

In [None]:
gradient_loss = backPropogation(y_pred, y, activations, dsigmoid, d_cross_entropy, parameters, x)

In [None]:
def gradient_descent(parameters, losses, eta):
    new_parameters = {}
    for i in range(len(parameters) // 2):
        new_parameters[f"W{i + 1}"] = parameters[f"W{i + 1}"] - eta * losses[f"LW{i + 1}"]
        new_parameters[f"b{i + 1}"] = parameters[f"b{i + 1}"] - eta * losses[f"Lb{i + 1}"]
    return new_parameters

In [None]:
new_parameters = gradient_descent(parameters, gradient_loss, 1)

In [None]:
y_pred, activations = forward_propogation(new_parameters, x, sigmoid, softmax)

In [None]:
cross_entropy(y_pred, y)

0.07711344018879235

# Backpropogation code

In [22]:
from scipy.special import expit

# Helper Functions
def sigmoid(x):
    return expit(x)

def cross_entropy(y_pred, y_true, epsilon=1e-15):
    # Clip y_pred to avoid values close to 0 or 1
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -(y_true @ np.log(y_pred))

def d_cross_entropy(y, y_pred):
    return -(y - y_pred)

def softmax(L):
    exp_L = np.exp(L - np.max(L))  # Subtract the maximum value to avoid overflow
    return exp_L / exp_L.sum(axis=0, keepdims=True)  # Compute softmax along the specified axis

def dsigmoid(x):
    return sigmoid(x) * sigmoid(1- x)

# Initialize Parameters

def initialize_parameters(inputs, hidden_layers, outputs):
    parameters = {}
    parameters["W1"] = np.random.rand(hidden_layers[0], inputs)
    parameters["b1"] = np.random.rand(hidden_layers[0])
    for i in range(1, len(hidden_layers)):
        parameters[f"W{i+1}"] = np.random.rand(hidden_layers[i], hidden_layers[i - 1])
        parameters[f"b{i+1}"] = np.random.rand(hidden_layers[i])
    parameters[f"W{len(hidden_layers) + 1}"] = np.random.rand(outputs, hidden_layers[-1])
    parameters[f"b{len(hidden_layers) + 1}"] = np.random.rand(outputs)
    return parameters

# Forward Propogation

def forward_propogation(parameters, x, g, O):
    activations = {}

    activations["a1"] = parameters["W1"] @ x + parameters["b1"]
    print(parameters["W1"].shape, x.shape)
    activations["h1"] = g(activations["a1"])
    for i in range(2, len(parameters) // 2):
        activations[f"a{i}"] = parameters[f"W{i}"] @ activations[f"h{i - 1}"] + parameters[f"b{i}"]
        activations[f"h{i}"] = g(activations[f"a{i}"])

    activations[f"a{len(parameters) // 2}"] = parameters[f"W{len(parameters) // 2}"] @ activations[f"h{len(parameters) // 2 - 1}"] + parameters[f"b{len(parameters) // 2}"]
    y_pred = O(activations[f"a{len(parameters) // 2}"])
    return y_pred, activations

# Backpropogation

def backPropogation(y_pred, y, activations, g_dash, O_dash, parameters, x):
    losses = {}
    n = len(parameters) // 2
    m = len(activations) // 2
    La = O_dash(y, y_pred)


    Lh = La @ parameters[f"W{n}"]

    da = g_dash(activations[f"a{m}"])
    losses[f"W{n}"] = np.outer(La, activations[f"h{m}"])
    losses[f"b{n}"] = La.copy()




    for i in range(1, m):
        La = Lh * da
        Lh = La @ parameters[f"W{m - i + 1}"]
        da = g_dash(activations[f"a{m - i}"])
        losses[f"W{m - i + 1}"] = np.outer(La, activations[f"h{m - i}"])
        losses[f"b{m - i + 1}"] = La.copy()

    La = Lh * da
    losses["W1"] = np.outer(La, x)
    losses["b1"] = La.copy()


    return losses

# Gradient Descent (Batch)
def gradient_descent(parameters, losses, eta):
    for i in range(len(parameters) // 2):
        parameters[f"W{i + 1}"] = parameters[f"W{i + 1}"] - eta * losses[f"W{i + 1}"]
        parameters[f"b{i + 1}"] = parameters[f"b{i + 1}"] - eta * losses[f"b{i + 1}"]
    return parameters

def sgd(parameters, losses, eta):
    new_parameters = {}
    num_samples = len(losses)  # Number of data points or samples

    # Iterate over each sample and update parameters using its corresponding gradient
    for i in range(num_samples):
        for j in range(1, len(parameters) // 2 + 1):  # Iterate over layers
            new_parameters[f"W{j}"] = parameters[f"W{j}"] - eta * losses[f"W{j}"]
            new_parameters[f"b{j}"] = parameters[f"b{j}"] - eta * losses[f"b{j}"]

    return new_parameters


def train(X, Y, epochs, hidden_layers, g, O, g_dash, O_dash, eta, optimizer):
    parameters = initialize_parameters(X[0].shape[0], hidden_layers, Y[0].shape[0])
    print(parameters)
    for _ in range(epochs):
        total_gradient_loss = {key: 0 for key in parameters.keys()}  # Initialize total gradient loss
        for x, y in zip(X, Y):  # Iterate over each data point
            y_pred, activations = forward_propogation(parameters, x, g, O)
            gradient_loss = backPropogation(y_pred, y, activations, g_dash, O_dash, parameters, x)
            # Accumulate the gradients for each data point
            for key in total_gradient_loss:
                total_gradient_loss[key] += gradient_loss.get(key, 0)
        # Update the parameters using the accumulated gradients
        parameters = optimizer(parameters, total_gradient_loss, eta)
    return parameters

def evaluate(X, parameters, g, O):
    return np.array(forward_propogation(parameters, X, g, O)[0])


In [23]:
parameters = train(X, Y, 300, [3], sigmoid, softmax, dsigmoid, d_cross_entropy, 50, sgd)

{'W1': array([[0.28869735, 0.23567597, 0.11877703],
       [0.30184999, 0.87898168, 0.23652918],
       [0.72732875, 0.83935795, 0.419922  ]]), 'b1': array([0.12565607, 0.26643362, 0.08139086]), 'W2': array([[0.38396647, 0.7600736 , 0.54402482],
       [0.08819219, 0.5653798 , 0.66892124],
       [0.01872262, 0.16715281, 0.0605212 ]]), 'b2': array([0.67908367, 0.20217154, 0.85491273])}


In [25]:
def initialize_parameters(inputs, hidden_layers, outputs):
    parameters = {}
    parameters["W1"] = np.random.rand(hidden_layers[0], inputs)
    parameters["b1"] = np.random.rand(hidden_layers[0])
    for i in range(1, len(hidden_layers)):
        parameters[f"W{i+1}"] = np.random.rand(hidden_layers[i], hidden_layers[i - 1])
        parameters[f"b{i+1}"] = np.random.rand(hidden_layers[i])
    parameters[f"W{len(hidden_layers) + 1}"] = np.random.rand(outputs, hidden_layers[-1])
    parameters[f"b{len(hidden_layers) + 1}"] = np.random.rand(outputs)

    # Initialize AdaM parameters
    v = {}
    s = {}
    for key in parameters.keys():
        if key.startswith("W") or key.startswith("b"):
            v[key] = np.zeros_like(parameters[key])
            s[key] = np.zeros_like(parameters[key])

    return parameters, v, s

def adam(parameters, v, s, losses, eta, beta1=0.9, beta2=0.999, epsilon=1e-8, t=0):
    for key in parameters.keys():
        if key.startswith("W") or key.startswith("b"):
            # Compute gradients
            gradient = losses[key]

            # Update time step
            t += 1

            # Update biased first moment estimate
            v[key] = beta1 * v[key] + (1 - beta1) * gradient
            # Update biased second moment estimate
            s[key] = beta2 * s[key] + (1 - beta2) * (gradient ** 2)

            # Correct the bias in the first moment
            v_corrected = v[key] / (1 - beta1 ** t)
            # Correct the bias in the second moment
            s_corrected = s[key] / (1 - beta2 ** t)

            # Update parameters
            parameters[key] -= eta * v_corrected / (np.sqrt(s_corrected) + epsilon)

    return parameters, v, s

def train(X, Y, epochs, hidden_layers, g, O, g_dash, O_dash, eta, optimizer):
    parameters, v, s = initialize_parameters(X[0].shape[0], hidden_layers, Y[0].shape[0])

    for _ in range(epochs):
        for x, y in zip(X, Y):
            y_pred, activations = forward_propogation(parameters, x, g, O)
            gradient_loss = backPropogation(y_pred, y, activations, g_dash, O_dash, parameters, x)
            parameters, v, s = optimizer(parameters, v, s, gradient_loss, eta)

    return parameters

# Usage example:
# parameters = train(X, Y, epochs, hidden_layers, sigmoid, softmax, dsigmoid, d_cross_entropy, eta, adam)


In [26]:

parameters = train(X, Y, 300, [3], sigmoid, softmax, dsigmoid, d_cross_entropy, 50, adam)

In [27]:
parameters

{'W1': array([[ 5.02904837e+02,  1.98871266e-01,  5.02161448e+02],
        [-5.01262445e+02,  8.23084735e-02, -5.02013289e+02],
        [ 5.03151126e+02,  6.26234032e-01,  5.02717998e+02]]),
 'b1': array([ 373.53031037, -372.92542623,  374.33764426]),
 'W2': array([[-320.82486349, -319.87119613, -320.78648288],
        [-320.04887885, -319.91046457, -320.08023504],
        [ 321.53481382,  321.53971701,  321.54160138]]),
 'b2': array([-291.47668637, -290.92223191,  292.09833532])}

In [None]:
X[0].shape, Y[0].shape

((3,), (3,))

In [None]:
Y_pred = []
for x in X:
    y_pred = list(evaluate(x, parameters, sigmoid, softmax))
    Y_pred.append(y_pred)
Y_pred = np.array(Y_pred)

In [None]:
Y, Y_pred

(array([[0, 0, 1]]), array([[0., 0., 1.]]))

# Testing phase

In [None]:
# import numpy as np

# # Original data
# y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])

# # Create X such that the index of the number of y is maximum in X
# X = np.zeros((len(y), 10))  # Initialize X with zeros

# beta = 0.85
# for i in range(len(y)):
#     max_index = y[i]  # Index of the number in y
#     X[i, max_index] = beta  # Set the highest value in X
#     X[i] += (1 - beta) / 9 # Add a small value to all elements to make sure they are not zero

# print("X:")
# print(X)
# print("Shape of X:", X.shape)
# print("y:", y)
# num_classes = 10
# y_one_hot = np.eye(num_classes)[y]
# y_one_hot

import numpy as np

X = np.array([
    [0.1, 0.5, 0.2, 0.9, 0.3, 0.6, 0.4, 0.8, 0.7, 0.2],
    [0.3, 0.2, 0.7, 0.4, 0.6, 0.8, 0.9, 0.5, 0.1, 0.3],
    [0.8, 0.9, 0.4, 0.1, 0.5, 0.2, 0.7, 0.6, 0.3, 0.9],
    [0.6, 0.2, 0.3, 0.7, 0.4, 0.1, 0.8, 0.9, 0.5, 0.6],
    [0.5, 0.1, 0.8, 0.2, 0.9, 0.7, 0.3, 0.4, 0.6, 0.5],
    [0.7, 0.3, 0.6, 0.5, 0.8, 0.4, 0.1, 0.2, 0.9, 0.7],
    [0.2, 0.8, 0.1, 0.6, 0.4, 0.5, 0.7, 0.3, 0.2, 0.8],
    [0.9, 0.6, 0.5, 0.3, 0.7, 0.2, 0.4, 0.1, 0.8, 0.6],
    [0.4, 0.7, 0.9, 0.8, 0.2, 0.3, 0.6, 0.5, 0.1, 0.4],
    [0.5, 0.3, 0.7, 0.4, 0.1, 0.8, 0.2, 0.6, 0.7, 0.5]
])

y = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])

print("X shape:", X.shape)
print("Original y:", y)

# Convert y to one-hot encoded vectors
num_classes = len(np.unique(y))
y_one_hot = np.eye(num_classes)[y]

print("One-hot encoded y:")
print(y_one_hot)


X shape: (10, 10)
Original y: [1 0 1 0 1 0 1 0 1 0]
One-hot encoded y:
[[0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]]


In [None]:
parameters = train(X, y_one_hot, 3000, [10], sigmoid, softmax, dsigmoid, d_cross_entropy, 0.01, adam)

In [None]:
parameters

{'W1': array([[ -2.51072087,   6.19766604,   0.67532679,  -0.99178004,
           4.26468957,  -1.61609701,  -5.57679556,   1.00271536,
           1.64213245,   1.07632908],
        [ -2.44288028,  -3.39721088,   2.04884379,  -7.6637196 ,
           4.30587898,   1.2795474 ,  -4.59169589,   1.53730854,
          -0.14477385,  -0.09671343],
        [ -5.68646646,   2.71630869,   1.48319004,  -2.61143038,
           5.52352841,   2.18021446,   4.77360728,   2.98010979,
          -4.7730624 ,  -2.89451556],
        [ -2.43524232,  -3.18139909,   1.45336449,  -7.33256972,
           4.26204423,   1.40242954,  -4.66638533,   1.52323293,
          -0.25857428,  -0.06945312],
        [ -5.72020058,   3.42649823,   1.68299755,  -2.60292743,
           5.22512958,   3.04507946,   3.69233077,   2.52104526,
          -5.11112125,  -2.54772547],
        [ -2.62817088,  -3.05082959,   1.91350118,  -7.32940427,
           4.36178001,   0.64396745,  -4.49615928,   2.00979363,
          -0.1294903 ,  

In [None]:
Y_pred = []
for x in X:
    y_pred = list(evaluate(x, parameters, sigmoid, softmax))
    Y_pred.append(y_pred)
Y_pred = np.array(Y_pred)

In [None]:
Y_pred, y_one_hot

(array([[1.06046168e-06, 9.99998940e-01],
        [9.99996379e-01, 3.62106986e-06],
        [7.16625520e-07, 9.99999283e-01],
        [9.99999647e-01, 3.52673575e-07],
        [3.49575888e-06, 9.99996504e-01],
        [9.99998245e-01, 1.75539432e-06],
        [2.44181581e-07, 9.99999756e-01],
        [9.99999870e-01, 1.29693404e-07],
        [7.91537154e-07, 9.99999208e-01],
        [9.99999282e-01, 7.17695289e-07]]),
 array([[0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.]]))

In [None]:
for i in Y_pred:
    print(i.argmax())

1
0
1
0
1
0
1
0
1
0
