<a href="https://colab.research.google.com/github/ShantanuKadam3115/MachineLearningBasics/blob/ML_implementations/3layerDeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np  # Import the math library

# Single input example, stored as a 2-D array: one row (the example)
# by two columns (its features).
x = np.array([[1, -1]])

# Target class index for that example (class 2). It is kept in a 1-D array
# so the same code path also works for a batch of labels.
y = np.array([2])

# Echo the data once so the shapes can be checked by eye before the network
# is built. One print call with a newline separator emits the same text as
# four consecutive print calls.
print(
    "--- Data Setup ---",
    f"Input x:\n{x}",
    f"Shape of x: {x.shape} (1 row, 2 columns)",
    f"Target y: {y}",
    sep="\n",
)

--- Data Setup ---
Input x:
[[ 1 -1]]
Shape of x: (1, 2) (1 row, 2 columns)
Target y: [2]


In [8]:
class TalkativeThreeLayerNet:
    """Three-layer fully connected network that can narrate each step.

    Architecture: affine -> ReLU -> affine -> ReLU -> affine, followed by a
    softmax cross-entropy loss that is evaluated inside ``backward_pass``.

    All parameters live in ``self.params`` under the keys ``'W1'``, ``'b1'``,
    ``'W2'``, ``'b2'``, ``'W3'``, ``'b3'``. Each weight matrix is stored as
    (outputs, inputs), which is why the forward pass multiplies by ``W.T``.
    """

    # __init__ is the "Constructor". It runs once when you create the object.
    def __init__(self, input_size, h1, h2, output_size):
        """Create the network with the fixed weights of the worked example.

        Args:
            input_size, h1, h2, output_size: Accepted so the constructor has
                the usual layer-size signature, but NOT used: the weights
                below are hard-coded literals with shapes (3, 2), (3, 3) and
                (4, 3), i.e. a 2 -> 3 -> 3 -> 4 network regardless of what is
                passed in.
        """
        self.params = {}  # A dictionary to hold our weights

        # Initialize weights with the exact numbers from our paper exercise
        # We use .astype(float) to make sure Python treats them as decimals, not integers
        self.params['W1'] = np.array([[1, 1], [1, -1], [-1, 0]]).astype(float)
        self.params['b1'] = np.zeros(3)  # Zeros vector of size 3

        self.params['W2'] = np.array([[1, 0, 0], [0, 0.5, 0], [1, 1, 1]]).astype(float)
        self.params['b2'] = np.zeros(3)

        self.params['W3'] = np.array([[0, 0, 0], [0, 1, 0], [0, -1, 1], [0, 0, -1]]).astype(float)
        self.params['b3'] = np.zeros(4)

    def forward_pass(self, X, verbose=False):
        """Compute the class scores for a batch of inputs.

        Args:
            X: 2-D array with one example per row; the number of columns must
                match the second dimension of ``W1``.
            verbose: When True, print the pre- and post-activation values of
                every layer.

        Returns:
            A pair ``(z3, cache)`` where ``z3`` holds the final (pre-softmax)
            scores and ``cache`` is the tuple ``(z1, h1, z2, h2, z3)`` that
            ``backward_pass`` expects.
        """
        # Unpack weights from the dictionary for easier typing
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']

        # --- LAYER 1 ---
        # .dot() is Matrix Multiplication
        # .T means Transpose (flip rows/cols)
        z1 = X.dot(W1.T) + b1
        h1 = np.maximum(0, z1)  # ReLU function

        if verbose:
            print(f"\n[Layer 1] Input shape: {X.shape}, W1 shape: {W1.shape}")
            print(f"z1 (Pre-activation):\n{z1}")
            print(f"h1 (Post-ReLU):\n{h1}")

        # --- LAYER 2 ---
        z2 = h1.dot(W2.T) + b2
        h2 = np.maximum(0, z2)

        if verbose:
            print(f"\n[Layer 2] W2 shape: {W2.shape}")
            print(f"z2 (Pre-activation):\n{z2}")
            print(f"h2 (Post-ReLU):\n{h2}")

        # --- LAYER 3 ---
        # No activation here: z3 are the raw scores fed to softmax later.
        z3 = h2.dot(W3.T) + b3

        if verbose:
            print(f"\n[Layer 3] W3 shape: {W3.shape}")
            print(f"z3 (Final Scores):\n{z3}")

        # Store values in 'cache' because we need them for Backprop later
        cache = (z1, h1, z2, h2, z3)
        return z3, cache

    def backward_pass(self, X, y, cache, verbose=False):
        """Compute the softmax cross-entropy loss and all parameter gradients.

        Args:
            X: The same 2-D input batch that produced ``cache``.
            y: Integer class index for each row of ``X``.
            cache: The ``(z1, h1, z2, h2, z3)`` tuple returned by
                ``forward_pass``.
            verbose: When True, print the probabilities, the loss and the
                intermediate gradients.

        Returns:
            A pair ``(loss, grads)``. ``loss`` is the cross-entropy averaged
            over the rows of ``X``. ``grads`` maps every key of
            ``self.params`` (``'W1'``..``'W3'`` and ``'b1'``..``'b3'``) to a
            gradient array with the same shape as that parameter.
        """
        # Unpack our saved values
        z1, h1, z2, h2, z3 = cache

        # Only W2 and W3 are needed to push gradients back through the layers;
        # W1 sits at the input, so nothing flows "through" it.
        W2, W3 = self.params['W2'], self.params['W3']
        N = X.shape[0]  # Number of rows (examples) in the batch
        rows = np.arange(N)

        # 1. Softmax + Loss Calculation
        # Subtracting the row maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing on large scores.
        shifted = z3 - np.max(z3, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Calculate Loss
        correct_logprobs = -np.log(probs[rows, y])
        loss = np.sum(correct_logprobs) / N

        if verbose:
            print("\n--- Backward Pass Starts ---")
            print(f"Probabilities: {probs}")
            print(f"Loss: {loss:.4f}")

        # 2. Gradient Calculation (The Blame Game)
        grads = {}

        # -- Gradient at Output --
        # Work on a copy so `probs` keeps holding real probabilities instead
        # of being silently overwritten by the in-place updates below.
        dscores = probs.copy()
        dscores[rows, y] -= 1  # Subtract 1 from correct class
        dscores /= N

        if verbose:
            print(f"dL/dz3 (dscores):\n{dscores}")

        # -- Backprop Layer 3 --
        grads['W3'] = dscores.T.dot(h2)
        # A bias is added to every row, so its gradient is the column-wise
        # sum of the upstream gradient over the batch.
        grads['b3'] = dscores.sum(axis=0)
        dh2 = dscores.dot(W3)  # Gradient flowing to hidden layer 2

        # -- Backprop ReLU 2 --
        # (z2 > 0) creates a mask of True/False.
        # Multiplication acts as the "Gate"
        dz2 = dh2 * (z2 > 0)

        if verbose:
            print(f"\ndL/dh2 (Gradient at Hidden 2):\n{dh2}")
            print(f"dL/dz2 (After ReLU Gate):\n{dz2}")

        # -- Backprop Layer 2 --
        grads['W2'] = dz2.T.dot(h1)
        grads['b2'] = dz2.sum(axis=0)
        dh1 = dz2.dot(W2)

        # -- Backprop ReLU 1 --
        dz1 = dh1 * (z1 > 0)

        if verbose:
            print(f"\ndL/dh1 (Gradient at Hidden 1):\n{dh1}")
            print(f"dL/dz1 (After ReLU Gate):\n{dz1}")

        # -- Backprop Layer 1 --
        grads['W1'] = dz1.T.dot(X)
        grads['b1'] = dz1.sum(axis=0)

        if verbose:
            print(f"\nFinal Result: dL/dW1:\n{grads['W1']}")

        return loss, grads

In [9]:
# Build the network. The sizes are given positionally in the order
# (input_size, h1, h2, output_size).
net = TalkativeThreeLayerNet(2, 3, 3, 4)

# Forward pass with tracing switched on. It hands back the final scores plus
# the cached intermediate values that the backward pass consumes.
print(">>> RUNNING FORWARD PASS >>>")
scores, cache = net.forward_pass(X=x, verbose=True)

# Backward pass with tracing switched on. It hands back the loss and the
# dictionary of gradients.
print("\n>>> RUNNING BACKWARD PASS >>>")
loss, grads = net.backward_pass(X=x, y=y, cache=cache, verbose=True)

>>> RUNNING FORWARD PASS >>>

[Layer 1] Input shape: (1, 2), W1 shape: (3, 2)
z1 (Pre-activation):
[[ 0.  2. -1.]]
h1 (Post-ReLU):
[[0. 2. 0.]]

[Layer 2] W2 shape: (3, 3)
z2 (Pre-activation):
[[0. 1. 2.]]
h2 (Post-ReLU):
[[0. 1. 2.]]

[Layer 3] W3 shape: (4, 3)
z3 (Final Scores):
[[ 0.  1.  1. -2.]]

>>> RUNNING BACKWARD PASS >>>

--- Backward Pass Starts ---
Probabilities: [[0.15216302 0.41362198 0.41362198 0.02059303]]
Loss: 0.8828
dL/dz3 (dscores):
[[ 0.15216302  0.41362198 -0.58637802  0.02059303]]

dL/dh2 (Gradient at Hidden 2):
[[ 0.          1.         -0.60697105]]
dL/dz2 (After ReLU Gate):
[[ 0.          1.         -0.60697105]]

dL/dh1 (Gradient at Hidden 1):
[[-0.60697105 -0.10697105 -0.60697105]]
dL/dz1 (After ReLU Gate):
[[-0.         -0.10697105 -0.        ]]

Final Result: dL/dW1:
[[ 0.          0.        ]
 [-0.10697105  0.10697105]
 [ 0.          0.        ]]
