# Lab 1 (Re)Introduction to Neural Networks


## I. Supervised Learning - Classification

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
import torchaudio
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class FashionMNISTModel(nn.Module):
    """Small CNN classifier for single-channel 28x28 images with 10 classes.

    Two 3x3 convolutions, one 2x2 max-pool and two fully connected layers.
    ``forward`` returns per-class log-probabilities (``log_softmax``), which
    is the input format ``F.nll_loss`` expects.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 1 -> 32 -> 64 channels, 3x3 kernels, stride 1.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        # 9216 = 64 channels * 12 * 12: a 28x28 input shrinks to 24x24 after
        # the two unpadded 3x3 convolutions and to 12x12 after pooling.
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, 10)`` for input ``x``."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))
        features = self.dropout1(F.max_pool2d(features, 2))
        hidden = F.relu(self.fc1(torch.flatten(features, 1)))
        logits = self.fc2(self.dropout2(hidden))
        return F.log_softmax(logits, dim=1)

In [None]:
def train(model, device, train_loader, optimizer, epoch, log_interval):
    """Run one training epoch and print the batch loss every ``log_interval`` batches.

    Args:
        model: Module whose output is fed to ``F.nll_loss``.
        device: Device the batches are moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches; it must support
            ``len()`` and expose ``.dataset`` for the progress message.
        optimizer: Optimizer stepping the model's parameters.
        epoch: Epoch number, used only in the log line.
        log_interval: A line is printed for every batch index divisible by this.
    """
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()

        if step % log_interval != 0:
            continue
        # Progress counts samples seen before this batch, not including it.
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), batch_loss.item()))

def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print average loss and accuracy.

    Args:
        model: Module returning per-class log-probabilities.
        device: Device the batches are moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches exposing ``.dataset``.
    """
    model.eval()
    total_loss = 0
    n_correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)
            # Sum (not mean) per batch so one division at the end gives the
            # per-sample average even when the last batch is smaller.
            total_loss += F.nll_loss(log_probs, labels, reduction='sum').item()
            # Predicted class = index of the largest log-probability.
            predicted = log_probs.argmax(dim=1, keepdim=True)
            n_correct += predicted.eq(labels.view_as(predicted)).sum().item()

    n_samples = len(test_loader.dataset)
    mean_loss = total_loss / n_samples

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        mean_loss, n_correct, n_samples,
        100. * n_correct / n_samples))


In [None]:
# Hyper-parameters for the classification run.
batch_size = 64
test_batch_size = 1000
epochs = 14
lr = 1.0
gamma = 0.7  # multiplicative learning-rate decay applied after every epoch
no_cuda = False
no_mps = False  # NOTE(review): not consulted anywhere in this cell
seed = 1
log_interval = 10

# Apply the seed so weight initialisation and batch order are reproducible;
# previously `seed` was defined but never used.
torch.manual_seed(seed)

# Honour the `no_cuda` switch instead of ignoring it.
use_cuda = not no_cuda and torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# NOTE(review): the model class is named FashionMNISTModel, but this cell
# loads datasets.MNIST; the normalisation constants look like the usual MNIST
# mean/std. Confirm which dataset is intended.
transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
    ])
dataset1 = datasets.MNIST('../data', train=True, download=True,
                    transform=transform)
dataset2 = datasets.MNIST('../data', train=False,
                    transform=transform)
# Shuffle the training batches each epoch; evaluation order does not matter.
train_loader = DataLoader(dataset1, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset2, batch_size=test_batch_size)

model = FashionMNISTModel().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=lr)

# Decay the learning rate by `gamma` after every epoch.
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch, log_interval)
    test(model, device, test_loader)
    scheduler.step()

## II. Unsupervised Learning - Dimensionality reduction

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

# Define the Autoencoder model
class Autoencoder(nn.Module):
    """Fully connected autoencoder compressing 784-dimensional vectors to 9.

    The encoder narrows 784 -> 128 -> 64 -> 36 -> 18 -> 9 with ReLU between
    the linear layers; the decoder mirrors it and ends in a Sigmoid, so the
    reconstruction lies between 0 and 1.
    """

    # Layer widths from the input down to the bottleneck.
    _WIDTHS = (28 * 28, 128, 64, 36, 18, 9)

    def __init__(self):
        super().__init__()
        widths = self._WIDTHS

        # Encoder: Linear + ReLU pairs, with no activation on the bottleneck.
        encoder_layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            encoder_layers += [nn.Linear(n_in, n_out), nn.ReLU(True)]
        encoder_layers.pop()
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: the same widths in reverse; the final activation is a
        # Sigmoid instead of a ReLU.
        mirrored = widths[::-1]
        decoder_layers = []
        for n_in, n_out in zip(mirrored[:-1], mirrored[1:]):
            decoder_layers += [nn.Linear(n_in, n_out), nn.ReLU(True)]
        decoder_layers[-1] = nn.Sigmoid()
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the 9-dimensional code and decode it back."""
        return self.decoder(self.encoder(x))

# Data transformation and loading
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.view(-1))  # Flatten the 28x28 images
])
train_dataset = datasets.FashionMNIST(root='./data', train=True, transform=transform, download=True)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Initialize model, loss function, and optimizer
model = Autoencoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop: the target of the reconstruction loss is the input itself,
# so the labels are ignored.
num_epochs = 20
losses = []
for epoch in range(num_epochs):
    epoch_loss = 0
    for data in train_loader:
        img, _ = data
        img = img.view(-1, 28*28)  # Flatten images to 784-dimensional vectors
        output = model(img)
        loss = criterion(output, img)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)  # Average loss for the epoch
    losses.append(avg_loss)
    # Report the epoch average (the value plotted below), not the loss of
    # whichever batch happened to come last.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# Plotting loss over epochs
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), losses, marker='o', color='b', label='Training Loss')
plt.title('Training Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

# Visualization function
def show_images(original, reconstructed):
    """Plot the first 10 originals (top row) above their reconstructions (bottom row).

    Each of the first 10 entries of both arguments is reshaped to 28x28 and
    drawn in grayscale with the axes hidden.
    """
    fig, axes = plt.subplots(2, 10, figsize=(15, 4))
    for row, batch in enumerate((original, reconstructed)):
        for col in range(10):
            panel = axes[row, col]
            panel.imshow(batch[col].reshape(28, 28), cmap='gray')
            panel.axis('off')
    plt.show()

# Test and visualize reconstructed images
# Draw the sample from the held-out split: reconstructing images the model was
# trained on says nothing about how well it generalises.
test_dataset = datasets.FashionMNIST(root='./data', train=False, transform=transform, download=True)
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=True)
images, _ = next(iter(test_loader))
images = images.view(-1, 28*28)
model.eval()
with torch.no_grad():
    reconstructed_images = model(images)
show_images(images.numpy(), reconstructed_images.numpy())


## III. Usage of pre-trained models - Speech Recognition

In [None]:
import torch
import torchaudio

# Print the library versions in use for this run.
print(torch.__version__)
print(torchaudio.__version__)

# Fix the RNG seed and use the GPU when one is available.
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

In [None]:
import IPython
import matplotlib.pyplot as plt
from torchaudio.utils import download_asset

# Sample utterance used throughout this section.
# NOTE(review): presumably download_asset returns a local path to the fetched file — confirm.
SPEECH_FILE = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")

In [None]:
# Pre-trained pipeline bundle; it exposes the sample rate it expects and the
# output labels, which the later cells use for resampling and decoding.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H

print("Sample Rate:", bundle.sample_rate)

print("Labels:", bundle.get_labels())

In [None]:
# Build the bundle's model and move it to the selected device.
# Note: this rebinds the global `model` used by earlier sections.
model = bundle.get_model().to(device)

print(model.__class__)

In [None]:
# Last expression of the cell: renders an audio player for the sample.
IPython.display.Audio(SPEECH_FILE)

In [None]:
# Load the audio and move it to the same device as the model.
waveform, sample_rate = torchaudio.load(SPEECH_FILE)
waveform = waveform.to(device)

# Resample only when the file's rate differs from the rate the bundle reports.
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

In [None]:
# Extract intermediate features without tracking gradients; the second return
# value is discarded. `features` is iterated layer by layer in the next cell.
with torch.inference_mode():
    features, _ = model.extract_features(waveform)

In [None]:
# One subplot per entry of `features`, stacked vertically.
fig, ax = plt.subplots(len(features), 1, figsize=(16, 4.3 * len(features)))
for i, feats in enumerate(features):
    # feats[0] selects the first item along the leading axis
    # (presumably the batch dimension — TODO confirm) and moves it to the CPU for plotting.
    ax[i].imshow(feats[0].cpu(), interpolation="nearest")
    ax[i].set_title(f"Feature from transformer layer {i+1}")
    ax[i].set_xlabel("Feature dimension")
    ax[i].set_ylabel("Frame (time-axis)")
fig.tight_layout()

In [None]:
# Run the full model without tracking gradients; the second return value is
# discarded. `emission` is plotted and decoded in the following cells.
with torch.inference_mode():
    emission, _ = model(waveform)

In [None]:
# Transpose the first emission so that frames run along the x-axis and
# classes along the y-axis, matching the axis labels below.
plt.imshow(emission[0].cpu().T, interpolation="nearest")
plt.title("Classification result")
plt.xlabel("Frame (time-axis)")
plt.ylabel("Class")
plt.tight_layout()
print("Class labels:", bundle.get_labels())

In [None]:
class GreedyCTCDecoder(torch.nn.Module):
    """Greedy (best-path) CTC decoder.

    Picks the highest-scoring label for every frame, merges consecutive
    repeats, drops the blank label and joins what is left into a string.
    """

    def __init__(self, labels, blank=0):
        super().__init__()
        self.blank = blank
        self.labels = labels

    def forward(self, emission: torch.Tensor) -> str:
        """Given a sequence emission over labels, get the best path string
        Args:
          emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`.

        Returns:
          str: The resulting transcript
        """
        best_path = emission.argmax(dim=-1)  # [num_seq,]
        collapsed = torch.unique_consecutive(best_path, dim=-1)
        pieces = []
        for label_index in collapsed:
            if label_index == self.blank:
                continue
            pieces.append(self.labels[label_index])
        return "".join(pieces)

In [None]:
# Decode the first emission with the bundle's labels (blank index left at its default of 0).
decoder = GreedyCTCDecoder(labels=bundle.get_labels())
transcript = decoder(emission[0])

In [None]:
# Show the decoded text next to an audio player for the same file.
print(transcript)
IPython.display.Audio(SPEECH_FILE)

## IV - DIY Implementation - Artificial Neuron Model

In [None]:
import numpy as np
def ReLU(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

# Layer size
input_size = 2
neuron_units = 10

# Initialization: one column vector of inputs, one weight row and one bias
# entry per neuron unit. The draws are made in the order input, weights, biases.
x = np.random.randn(input_size, 1)
weights = np.random.randn(neuron_units, input_size)
biases = np.random.randn(neuron_units, 1)

# Forward propagation: affine map of the input, then the activation.
activation = weights @ x + biases
y = ReLU(activation)

print(y)

In [None]:
'''
GOAL: Build a ANN network with 2 layers
* 1 layer:
    * Input size: 10
    * Output size, (aka. Neuron units; aka. layer size): 10
    * Activation function (for all units): ReLU
* 2 layer:
    * Input size: (same as previous layer size, here: 10)
    * Output size: 2
    * Activation function: Softmax
'''
import numpy as np
from typing import Tuple
from collections import namedtuple

# =============================== #

# Let's start with definitions of some activation functions
# ReLU with derivative for the 1st layer

def ReLU(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``."""
    floor = 0
    return np.maximum(floor, Z)

def ReLU_deriv(Z):
    """Derivative of ReLU as a boolean mask: True where ``Z`` is positive."""
    is_positive = Z > 0
    return is_positive

# And Softmax for the 2nd layer
# taken from https://stackoverflow.com/a/54977170

def softmax(Z, axis=0):
    """Numerically stable softmax along ``axis``.

    The default ``axis=0`` matches the column-vector layout used by the
    network in this file, where pre-activations have shape ``(units, 1)``.
    Pass ``axis=1`` for inputs laid out as one sample per row.

    Args:
        Z: Array-like of scores.
        axis: Axis over which the outputs sum to 1.

    Returns:
        Array with the same shape as ``Z``.
    """
    Z = np.asarray(Z, dtype=float)
    # Subtracting the maximum leaves the result unchanged but keeps exp()
    # from overflowing for large scores.
    shifted = Z - np.max(Z, axis=axis, keepdims=True)
    e = np.exp(shifted)
    # keepdims=True so the normaliser broadcasts against `e` along `axis`
    # instead of along the wrong dimension.
    return e / np.sum(e, axis=axis, keepdims=True)

def softmax_deriv(Z):
    """Return ``diag(Z) - Z Z^T`` with ``Z`` flattened to a column vector.

    This is the Jacobian of softmax when ``Z`` holds the softmax outputs.
    """
    # Work on a column vector so the outer product below is a square matrix.
    column = Z.reshape(-1, 1)
    outer = np.dot(column, column.T)
    return np.diagflat(column) - outer

# =============================== #

# Now, we need to randomly initialize the weights & biases.
# Let's start with some preparation and define namedtuples for passing our network parameters
# Parameters of one dense layer: a weight matrix and a bias vector.
Layer = namedtuple('Layer', 'weights bias')
# The whole two-layer network: one Layer per layer, in forward order.
ANN = namedtuple('ANN', 'layer_1 layer_2')

# Then, for simplicity, let's initialize our network in naive way
def init_ann_params(input_size: int, layer_1_size: int, layer_2_size: int) -> ANN:
    """Create a two-layer network with naively random parameters.

    Every weight and bias is drawn uniformly and shifted by -0.5.

    Args:
        input_size: Number of network inputs.
        layer_1_size: Number of units in the first layer.
        layer_2_size: Number of units in the second layer, i.e. the output
            size of the whole network.

    Returns:
        An ``ANN`` whose layers hold weights of shape ``(units, inputs)`` and
        biases of shape ``(units, 1)``.
    """
    def centered_uniform(rows: int, cols: int):
        # np.random.rand samples [0, 1); subtracting 0.5 centres it on zero.
        return np.random.rand(rows, cols) - 0.5

    # Layer 1 maps the network input to `layer_1_size` units.
    hidden = Layer(weights=centered_uniform(layer_1_size, input_size),
                   bias=centered_uniform(layer_1_size, 1))

    # Layer 2 takes the output of layer 1, so its input size must match the
    # size of the previous layer.
    output = Layer(weights=centered_uniform(layer_2_size, layer_1_size),
                   bias=centered_uniform(layer_2_size, 1))

    return ANN(hidden, output)

# =============================== #

# It's time for Forward Propagation definition
def forward_prop(ann: ANN, input_x: np.array) -> Tuple[np.array, np.array, np.array, np.array]:
    """Run one forward pass through the two-layer network.

    Args:
        ann: Network parameters (``layer_1`` and ``layer_2``).
        input_x: Input passed to ``layer_1.weights.dot``; the biases are
            created as ``(units, 1)`` columns, so a column vector of shape
            ``(input_size, 1)`` is expected.

    Returns:
        ``(sum_1, activate_1, sum_2, activate_2)``: the weighted sums and the
        activations of both layers. The sums are returned because the
        backward pass needs them for the activation derivatives.
    """
    # Layer 1: weights times input plus bias, then ReLU.
    layer_1 = ann.layer_1
    sum_1 = layer_1.weights.dot( input_x ) + layer_1.bias
    activate_1 = ReLU(sum_1)

    # Layer 2 consumes the OUTPUT of layer 1. Feeding it the raw network input
    # (as before) bypasses layer 1 entirely and only works by accident when
    # the input size equals the size of layer 1.
    layer_2 = ann.layer_2
    sum_2 = layer_2.weights.dot( activate_1 ) + layer_2.bias
    activate_2 = softmax(sum_2)

    return sum_1, activate_1, sum_2, activate_2

# =============================== #
# After the forward propagation it's time for
# The creme de la creme of the implementation - Backward Propagation

# For the input we will need:
#  1) sums and activations from the forward propagation,
#  2) the weights of the all previous layers
#  3) input sample with corresponding output sample (just one pair of samples!)
def backward_prop(sum_1: np.array,
                  activate_1: np.array,
                  sum_2: np.array,
                  activate_2: np.array,
                  ann: ANN,
                  input_x: np.array,
                  output_y: np.array) -> Tuple[np.array, np.array, np.array, np.array]:
    """Back-propagate a squared-error cost through the network for ONE sample.

    The cost is ``C = sum((activate_2 - output_y) ** 2)``. Vectors are
    expected as columns, matching the ``(units, 1)`` biases of the network.

    Args:
        sum_1: Weighted sums of layer 1, shape ``(layer_1, 1)``.
        activate_1: Activations of layer 1, shape ``(layer_1, 1)``.
        sum_2: Weighted sums of layer 2. Kept for interface compatibility; the
            softmax Jacobian is expressed through ``activate_2`` instead.
        activate_2: Softmax outputs of layer 2, shape ``(layer_2, 1)``.
        ann: Network parameters; only ``layer_2.weights`` is read.
        input_x: Network input, shape ``(input_size, 1)``.
        output_y: Target output, shape ``(layer_2, 1)``.

    Returns:
        ``(weights_1_deriv, bias_1_deriv, weights_2_deriv, bias_2_deriv)``,
        each with the same shape as the parameter it updates.
    """
    # The cost is a scalar, but its derivative with respect to the network
    # output has one entry per output unit. Summing it into a single number
    # (as before) discards which output was wrong.
    cost_deriv = 2 * (activate_2 - output_y)

    # Chain through softmax: dC/dsum_2 = J^T . dC/dactivate_2, where J is the
    # softmax Jacobian evaluated at the softmax OUTPUTS. This is also the
    # bias gradient of layer 2, because d(sum_2)/d(bias_2) is the identity.
    bias_2_deriv = softmax_deriv(activate_2).transpose().dot( cost_deriv )

    # Weight gradient = bias gradient (column) times the layer's input (row),
    # which yields the shape of the weight matrix: (layer_2, layer_1).
    weights_2_deriv = bias_2_deriv.dot( activate_1.transpose() )

    # Propagate the gradient to the output of layer 1 through the transposed
    # weights of layer 2: (layer_1, layer_2) . (layer_2, 1).
    layer_2 = ann.layer_2
    activate_1_deriv = layer_2.weights.transpose().dot( bias_2_deriv )

    # Same logic for layer 1: mask by the ReLU derivative, then multiply by
    # the layer input to get a (layer_1, input_size) weight gradient.
    bias_1_deriv = ReLU_deriv(sum_1) * activate_1_deriv
    weights_1_deriv = bias_1_deriv.dot( input_x.transpose() )
    # NOTE: the gradient with respect to the network input is not needed
    # (nothing is learned about the dataset itself).

    return weights_1_deriv, bias_1_deriv, weights_2_deriv, bias_2_deriv


# =============================== #

# Now lets define function for updating network parameters - weights & biases
# The alpha parameter is the learning rate
# Let's update the params in naive way for further simplification
def update_params(ann: "ANN",
                  weights_1_deriv: np.array,
                  bias_1_deriv: np.array,
                  weights_2_deriv: np.array,
                  bias_2_deriv: np.array,
                  alpha: float) -> "ANN":
    """Apply one plain gradient-descent step and return the updated network.

    Args:
        ann: Current network parameters.
        weights_1_deriv: Gradient for ``ann.layer_1.weights``.
        bias_1_deriv: Gradient for ``ann.layer_1.bias``.
        weights_2_deriv: Gradient for ``ann.layer_2.weights``.
        bias_2_deriv: Gradient for ``ann.layer_2.bias``.
        alpha: Learning rate.

    Returns:
        A new network of the same type as ``ann``; the input is not modified.
    """
    # namedtuples are immutable, so assigning to `layer.weights` raises
    # AttributeError. `_replace` builds a new tuple with the updated fields.
    layer_1 = ann.layer_1._replace(
        weights=ann.layer_1.weights - alpha * weights_1_deriv,
        bias=ann.layer_1.bias - alpha * bias_1_deriv,
    )
    layer_2 = ann.layer_2._replace(
        weights=ann.layer_2.weights - alpha * weights_2_deriv,
        bias=ann.layer_2.bias - alpha * bias_2_deriv,
    )

    # Return the ANN with updated weights & biases
    return ann._replace(layer_1=layer_1, layer_2=layer_2)

# =============================== #

# Finally, let's define Gradient Descent to call above functions
def gradient_descent(input_x: np.array,
                     output_y: np.array,
                     alpha: float,
                     iterations: int,
                     layer_1_size: int = 10) -> ANN:
    """Train a freshly initialised two-layer network on one input/output pair.

    Args:
        input_x: Input sample as a column vector ``(input_size, 1)``.
        output_y: Target output as a column vector ``(output_size, 1)``.
        alpha: Learning rate.
        iterations: Number of forward/backward/update rounds.
        layer_1_size: Number of units in the first layer. Defaults to 10, the
            size named in the goal description of this cell.

    Returns:
        The network after ``iterations`` updates.
    """
    # init_ann_params needs the three layer sizes; calling it without
    # arguments raises TypeError. Input and output sizes follow from the data.
    ann = init_ann_params(input_size=input_x.shape[0],
                          layer_1_size=layer_1_size,
                          layer_2_size=output_y.shape[0])
    for _ in range(iterations):
        # 1) Forward propagation
        sum_1, activate_1, sum_2, activate_2 = forward_prop(ann, input_x)
        # 2) Backward propagation
        weights_1_deriv, bias_1_deriv, weights_2_deriv, bias_2_deriv = backward_prop(sum_1,
                                                                                     activate_1,
                                                                                     sum_2,
                                                                                     activate_2,
                                                                                     ann,
                                                                                     input_x,
                                                                                     output_y)
        # 3) Update parameters
        ann = update_params(ann,
                            weights_1_deriv,
                            bias_1_deriv,
                            weights_2_deriv,
                            bias_2_deriv,
                            alpha)

    # Return the updated network
    return ann

## Part 2: Basic implementation

In [None]:
import numpy as np
from typing import Callable, Tuple
import matplotlib.pyplot as plt

def activation_function(x: float) -> float:
    """Logistic sigmoid ``1 / (1 + exp(-x))``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def activation_function_deriv(x: float) -> float:
    """Derivative of the logistic sigmoid: ``s * (1 - s)`` with ``s = sigmoid(x)``."""
    s = 1 / (1 + np.exp(-x))
    return s * (1 - s)

class Neuron:
    """Single artificial neuron trained with batch gradient descent.

    ``weights`` has shape ``(input_size, 1)`` and ``bias`` is a scalar. Inputs
    are given one sample per row; outputs are flattened to one value per sample.
    """

    def __init__(self, input_size: int, act_func: Callable, act_func_deriv: Callable):
        self._init_weights_and_bias(input_size)
        self._activation_function = act_func
        self._activation_function_deriv = act_func_deriv

    def _init_weights_and_bias(self, input_size: int):
        """Draw the weights and the bias from a standard normal distribution."""
        self.weights = np.random.randn(input_size, 1)
        self.bias = np.random.randn()

    def __call__(self, x: np.array) -> np.array:
        return self._forward_propagation(x)

    def _weighted_sum(self, x: np.array) -> np.array:
        """Pre-activation value ``x . weights + bias``."""
        return np.dot(x, self.weights) + self.bias

    def _forward_propagation(self, x: np.array) -> np.array:
        """Activated output for ``x``, flattened to 1-D."""
        activated = self._activation_function(self._weighted_sum(x))
        return activated.flatten()

    def gradient_descent(self, x: np.array, y_target: np.array, alpha: float, iterations: int) -> None:
        """Update weights and bias in place for ``iterations`` steps of size ``alpha``."""
        remaining = iterations
        while remaining > 0:
            step_w, step_b = self._backward_propagation(x, y_target)
            self.weights -= alpha * step_w
            self.bias -= alpha * step_b
            remaining -= 1

    def _backward_propagation(self, x: np.array, y: np.array) -> Tuple[np.array, np.array]:
        """Return ``(grad_w, grad_b)`` averaged over the samples in ``x``.

        The error is ``prediction - y``; ``grad_w`` has the shape of ``weights``.
        """
        pre_activation = self._weighted_sum(x)
        residual = self._activation_function(pre_activation).flatten() - y

        # Chain rule through the activation function.
        delta = residual * self._activation_function_deriv(pre_activation).flatten()
        n_samples = x.shape[0]
        weight_grad = np.dot(x.T, delta) / n_samples
        bias_grad = np.mean(delta)

        return weight_grad.reshape(self.weights.shape), bias_grad


## Part 3: Artificial Neuron as binary classifier
A single neuron used as binary classifier is also known as *perceptron*, frequently used as building block for *dense* layer. It can be used for logistic regression.

### Task 3.1
1) Using your Neuron class construct a following ANN:
  * Input size: 2
  * 1 layer with 1 unit with any activation function
  * Output size: 1

2) Perform separate trainings on the provided datasets of truth tables of logic gates. You can experiment with the number of iterations (start with n=500) and the learning rate (start with alpha = 0.1)

3) Visualize each dataset and ANN's result (a regression line, as function of two inputs).

4) Comment results

In [None]:
def train_and_visualize(dataset_x, dataset_y, title, alpha=0.1, iterations=500):
    """Train one sigmoid neuron on a two-input dataset and plot its decision regions.

    Args:
        dataset_x: Sequence of 2-element input samples.
        dataset_y: Sequence of targets, one per sample.
        title: Gate name used in the plot title.
        alpha: Learning rate. Defaults to the previously hard-coded 0.1.
        iterations: Number of gradient-descent steps. Defaults to the
            previously hard-coded 500.
    """
    X = np.array(dataset_x)
    y = np.array(dataset_y)

    neuron = Neuron(input_size=2, act_func=activation_function, act_func_deriv=activation_function_deriv)
    neuron.gradient_descent(X, y, alpha, iterations)

    plt.figure(figsize=(6, 6))
    plt.title(f"{title} Gate Decision Boundary")
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap="bwr", marker="o", edgecolor="k", s=100, label="Data Points")

    # Evaluate the neuron on a 100x100 grid slightly larger than the unit square.
    x0_vals = np.linspace(-0.5, 1.5, 100)
    x1_vals = np.linspace(-0.5, 1.5, 100)
    x0_grid, x1_grid = np.meshgrid(x0_vals, x1_vals)
    grid_points = np.c_[x0_grid.ravel(), x1_grid.ravel()]

    # The neuron accepts one sample per row, so the whole grid is evaluated in
    # a single call instead of a Python loop over 10,000 points.
    grid_outputs = neuron(grid_points).reshape(x0_grid.shape)

    # Two filled regions split at an output of 0.5.
    plt.contourf(x0_grid, x1_grid, grid_outputs, levels=[0, 0.5, 1], cmap="bwr", alpha=0.3)
    plt.colorbar()
    plt.xlabel("Input 1")
    plt.ylabel("Input 2")
    plt.legend()
    plt.show()

# All three gates share the same truth-table inputs; only the targets differ.
dataset_or_x = dataset_and_x = dataset_xor_x = ((0,0), (0,1), (1,0), (1,1))

dataset_or_y = (0, 1, 1, 1)
dataset_and_y = (0, 0, 0, 1)
dataset_xor_y = (0, 1, 1, 0)

# Train and plot one neuron per gate, in the order OR, AND, XOR.
for gate_name, gate_x, gate_y in (
    ("OR", dataset_or_x, dataset_or_y),
    ("AND", dataset_and_x, dataset_and_y),
    ("XOR", dataset_xor_x, dataset_xor_y),
):
    train_and_visualize(gate_x, gate_y, gate_name)

**OR Gate**: The neuron learned the OR logic, as it is linearly separable. The decision boundary separates the region where at least one input is 1.

**AND Gate**: The neuron also succeeds with the AND gate, which is linearly separable as well. The boundary includes only the (1, 1) point in the "1" region.

**XOR Gate**: The neuron struggles with XOR, as XOR is not linearly separable.
A single-layer neuron cannot accurately model XOR, so the decision boundary will be inadequate. For XOR, a multi-layer network is needed.

## Part 4: Multilayer perceptron
More neurons can be stacked together to model nonlinear properties.  

### Task 4.1
In this task you have to implement following ANN:
* Input size: 2
* 1 layer with 2 units with sigmoid activation function
* 1 layer with 1 unit with sigmoid activation function
* Output size: 1
    
Your Neuron class was not designed for ambitious merging of weights and biases during the gradient descent, nor for passing outputs to perform forward propagation. To overcome such inconvenience, please manually define dataflow and method calling for all Neurons. You can expand provided example.

In [None]:
class NeuralNetwork():
    """2-2-1 style network built from three ``Neuron`` objects.

    Neurons 1 and 2 form the hidden layer and both read the network input;
    neuron 3 is the output unit and reads the two hidden activations. Inputs
    are given one sample per row.
    """

    def __init__(self, input_size: int, act_func: Callable, act_func_deriv: Callable):
        self._neuron_1 = Neuron(input_size, act_func, act_func_deriv)
        self._neuron_2 = Neuron(input_size, act_func, act_func_deriv)
        # The output neuron consumes the two hidden activations, so it always
        # has 2 inputs, independent of the network's input size.
        self._neuron_3 = Neuron(2, act_func, act_func_deriv)

    def __call__(self, x: np.array) -> float:
        return self._network_forward_propagation(x)

    def _hidden_activations(self, x: np.array) -> np.array:
        """Outputs of the two hidden neurons as the columns of one matrix."""
        return np.column_stack((self._neuron_1(x), self._neuron_2(x)))

    def _network_forward_propagation(self, x: np.array) -> float:
        return self._neuron_3(self._hidden_activations(x))

    @staticmethod
    def _neuron_gradients(neuron, x: np.array, grad_output: np.array) -> Tuple[np.array, np.array]:
        """Gradients of one neuron's weights and bias.

        Args:
            neuron: Neuron whose parameters are differentiated.
            x: Inputs that neuron received, one sample per row.
            grad_output: Derivative of the loss with respect to the neuron's
                activated output, one value per sample.

        Returns:
            ``(grad_w, grad_b)`` averaged over the samples; ``grad_w`` has the
            shape of ``neuron.weights``.
        """
        z = np.dot(x, neuron.weights) + neuron.bias
        grad_z = grad_output * neuron._activation_function_deriv(z).flatten()
        grad_w = np.dot(x.T, grad_z) / x.shape[0]
        grad_b = np.mean(grad_z)
        return grad_w.reshape(neuron.weights.shape), grad_b

    def _network_backwards_propagation(self, x: np.array, y: np.array) -> Tuple[Tuple[np.array, np.array], Tuple[np.array, np.array], Tuple[np.array, np.array]]:
        """Return the ``(grad_w, grad_b)`` pairs for neurons 1, 2 and 3."""
        input_3 = self._hidden_activations(x)
        y_pred = self._neuron_3(input_3)

        # Output layer: the loss derivative with respect to the prediction is
        # the prediction error.
        error_3 = y_pred - y
        grad_w_3, grad_b_3 = self._neuron_gradients(self._neuron_3, input_3, error_3)

        # Gradient reaching each hidden activation: the output neuron's
        # pre-activation gradient scaled by the weight on that connection.
        z_3 = np.dot(input_3, self._neuron_3.weights) + self._neuron_3.bias
        grad_z_3 = error_3 * self._neuron_3._activation_function_deriv(z_3).flatten()
        grad_input_3_1 = grad_z_3 * self._neuron_3.weights[0]
        grad_input_3_2 = grad_z_3 * self._neuron_3.weights[1]

        # These are gradients, not targets: chain them through each hidden
        # neuron's own activation derivative. Passing them as `y` to
        # Neuron._backward_propagation would instead treat them as labels.
        grad_w_1, grad_b_1 = self._neuron_gradients(self._neuron_1, x, grad_input_3_1)
        grad_w_2, grad_b_2 = self._neuron_gradients(self._neuron_2, x, grad_input_3_2)

        return (grad_w_1, grad_b_1), (grad_w_2, grad_b_2), (grad_w_3, grad_b_3)

    def gradient_descent(self, x: np.array, y: np.array, alpha: float, iterations: int) -> None:
        """Run ``iterations`` full-batch updates of all three neurons in place."""
        for _ in range(iterations):
            (grad_w_1, grad_b_1), (grad_w_2, grad_b_2), (grad_w_3, grad_b_3) = self._network_backwards_propagation(x, y)

            self._neuron_1.weights -= alpha * grad_w_1
            self._neuron_1.bias -= alpha * grad_b_1

            self._neuron_2.weights -= alpha * grad_w_2
            self._neuron_2.bias -= alpha * grad_b_2

            self._neuron_3.weights -= alpha * grad_w_3
            self._neuron_3.bias -= alpha * grad_b_3

### Task 4.2
1) Train your ANN created in task 4.1 on the XOR dataset. You can experiment with number of iterations (start with n=500) and learning rate (start with alpha=0.1).

2) Visualize the dataset and ANN's result (a regression line, as function of two inputs).

3) Comment results.

In [None]:
dataset_xor_x = ((0,0), (0,1), (1,0), (1,1))
dataset_xor_y = (0, 1, 1, 0)

X = np.array(dataset_xor_x)
y = np.array(dataset_xor_y)

# Training settings for the three-neuron network.
alpha = 0.01
iterations = 10000

# Despite its name, `neuron` holds the whole network here.
neuron = NeuralNetwork(input_size=2, act_func=activation_function, act_func_deriv=activation_function_deriv)
neuron.gradient_descent(X, y, alpha, iterations)

# Evaluate the trained network on a 100x100 grid slightly larger than the
# unit square, one point at a time.
x0_vals = np.linspace(-0.5, 1.5, 100)
x1_vals = np.linspace(-0.5, 1.5, 100)
x0_grid, x1_grid = np.meshgrid(x0_vals, x1_vals)
grid_points = np.column_stack((x0_grid.ravel(), x1_grid.ravel()))
grid_outputs = np.array([neuron(point) for point in grid_points]).reshape(x0_grid.shape)

# Data points first, then the two filled regions split at an output of 0.5.
plt.figure(figsize=(6, 6))
plt.title("XOR Gate Decision Boundary")
plt.scatter(X[:, 0], X[:, 1], c=y, cmap="bwr", marker="o", edgecolor="k", s=100, label="Data Points")
plt.contourf(x0_grid, x1_grid, grid_outputs, levels=[0, 0.5, 1], cmap="bwr", alpha=0.3)
plt.colorbar()
plt.xlabel("Input 1")
plt.ylabel("Input 2")
plt.legend()
plt.show()

The neural network works better, but it is still not good enough.

In [8]:
import numpy as np
from typing import Callable

class Neuron:
    """Single neuron operating on one sample at a time.

    Holds a 1-D weight vector of length ``input_size`` and a scalar bias, both
    drawn from a standard normal distribution at construction.
    """

    def __init__(self, input_size: int, activation_function: Callable, activation_function_deriv: Callable):
        self._input_size = input_size
        self._activation_function = activation_function
        self._activation_function_deriv = activation_function_deriv
        self._init_weights_and_bias()

    def _init_weights_and_bias(self):
        """Randomly initialise the weights, then the bias."""
        self._weights = np.random.randn(self._input_size)
        self._bias = np.random.randn()

    def __call__(self, x: np.array) -> np.array:
        # Callers only want the activated output, not the linear term.
        _, activated = self._forward_propagation(x)
        return activated

    def _forward_propagation(self, x: np.array):
        """Return ``(z, a)``: the linear output and the activated output.

        Both are returned because backpropagation needs ``z`` as well.
        """
        linear = np.dot(self._weights, x) + self._bias
        return linear, self._activation_function(linear)

    def _backward_propagation(self, dz: float, x: np.array):
        """Return ``(dw, db)`` for a pre-activation gradient ``dz`` and input ``x``."""
        return dz * x, dz

    def _update_weights_and_bias(self, dw: np.array, db: float, alpha: float):
        """Take one gradient-descent step of size ``alpha`` in place."""
        self._weights -= alpha * dw
        self._bias -= alpha * db


class NeuralNetwork():
    """Network of two hidden neurons and one output neuron, trained per sample.

    Neurons 1 and 2 read the network input; neuron 3 reads their two
    activations. Each backward pass updates all three neurons in place.
    """

    def __init__(self, input_size: int, act_func: Callable, act_func_deriv: Callable):
        # Hidden layer: two neurons on the raw input.
        self._neuron_1 = Neuron(input_size, act_func, act_func_deriv)
        self._neuron_2 = Neuron(input_size, act_func, act_func_deriv)
        # Output neuron: one input per hidden neuron.
        self._neuron_3 = Neuron(2, act_func, act_func_deriv)

    def __call__(self, x: np.array) -> float:
        return self._network_forward_propagation(x)

    def _network_forward_propagation(self, x: np.array) -> float:
        """Return the network output for a single sample ``x``."""
        hidden = np.array([self._neuron_1(x), self._neuron_2(x)])
        return self._neuron_3(hidden)

    def _network_backwards_propagation(self, x: np.array, y: float, alpha: float) -> None:
        """Do one forward/backward pass on ``(x, y)`` and update every neuron.

        Args:
            x: One input sample.
            y: Its target value.
            alpha: Learning rate.
        """
        # Forward pass, keeping the linear outputs for the derivatives.
        z1, a1 = self._neuron_1._forward_propagation(x)
        z2, a2 = self._neuron_2._forward_propagation(x)
        input_3 = np.array([a1, a2])
        z3, a3 = self._neuron_3._forward_propagation(input_3)

        # Output error chained through the output activation.
        dz3 = (a3 - y) * self._neuron_3._activation_function_deriv(z3)

        # Compute ALL pre-activation gradients before touching any weight:
        # the hidden gradients must use the output weights that produced this
        # forward pass, not the already-updated ones.
        dz1 = dz3 * self._neuron_3._weights[0] * self._neuron_1._activation_function_deriv(z1)
        dz2 = dz3 * self._neuron_3._weights[1] * self._neuron_2._activation_function_deriv(z2)

        # Only now apply the updates.
        dw3, db3 = self._neuron_3._backward_propagation(dz3, input_3)
        self._neuron_3._update_weights_and_bias(dw3, db3, alpha)

        dw1, db1 = self._neuron_1._backward_propagation(dz1, x)
        self._neuron_1._update_weights_and_bias(dw1, db1, alpha)

        dw2, db2 = self._neuron_2._backward_propagation(dz2, x)
        self._neuron_2._update_weights_and_bias(dw2, db2, alpha)

    def gradient_descent(self, x: np.array, y: float, alpha: float, iterations: int) -> None:
        """Repeat the single-sample update ``iterations`` times."""
        for _ in range(iterations):
            self._network_backwards_propagation(x, y, alpha)

In [None]:
def train_nn(nn: "NeuralNetwork", dataset_x: np.array, dataset_y: np.array, alpha: float, epochs: int,
             steps_per_sample: int = 10) -> None:
    """Train ``nn`` sample by sample for a number of passes over the dataset.

    Args:
        nn: Network exposing ``gradient_descent(x, y, alpha, iterations)``.
        dataset_x: Sequence of input samples.
        dataset_y: Sequence of targets, paired with ``dataset_x`` by position.
        alpha: Learning rate.
        epochs: Number of passes over the dataset.
        steps_per_sample: Gradient-descent steps taken on each sample before
            moving to the next one. Defaults to the previously hard-coded 10.
    """
    for _ in range(epochs):
        for x, y in zip(dataset_x, dataset_y):
            x = np.array(x)
            nn.gradient_descent(x, y, alpha, steps_per_sample)

In [9]:
import numpy as np
import wandb
from typing import Callable

# Assuming NeuralNetwork and related classes are already defined

def train_nn_with_logging(nn: NeuralNetwork, 
                          dataset_x: np.array, 
                          dataset_y: np.array, 
                          alpha: float, 
                          epochs: int, 
                          validation_split: float = 0.1):
    """Train ``nn`` sample by sample and log per-epoch losses to Weights & Biases.

    The last ``validation_split`` fraction of the dataset is held out for
    validation. After training, the notebook file is uploaded as a code
    artifact.

    Args:
        nn: Network exposing ``gradient_descent`` and ``__call__``.
        dataset_x: Input samples.
        dataset_y: Targets, paired with ``dataset_x`` by position.
        alpha: Learning rate.
        epochs: Number of passes over the training part.
        validation_split: Fraction of samples held out from the end.
    """
    # Initialize wandb
    wandb.init(project="neural-network-training", name="basic-nn-training", config={
        "alpha": alpha,
        "epochs": epochs,
        "validation_split": validation_split,
        "input_size": len(dataset_x[0]),
    })

    # Everything after init runs under try/finally so the run is always
    # closed, even when training or the artifact upload raises.
    try:
        # Split dataset into training and validation sets (no shuffling: the
        # validation set is simply the tail of the data).
        split_index = int(len(dataset_x) * (1 - validation_split))
        train_x, val_x = dataset_x[:split_index], dataset_x[split_index:]
        train_y, val_y = dataset_y[:split_index], dataset_y[split_index:]

        for epoch in range(epochs):
            # Training: the squared error is measured right after the updates
            # on that sample.
            training_loss = 0.0
            for x, y in zip(train_x, train_y):
                x = np.array(x)
                nn.gradient_descent(x, y, alpha, 10)
                prediction = nn(x)
                training_loss += (prediction - y) ** 2

            training_loss /= len(train_x)

            # Validation
            validation_loss = 0.0
            if len(val_x) > 0:
                for x, y in zip(val_x, val_y):
                    prediction = nn(np.array(x))
                    validation_loss += (prediction - y) ** 2

                validation_loss /= len(val_x)

            # Log metrics; validation loss only when a validation set exists.
            metrics = {"training_loss": training_loss}
            if len(val_x) > 0:
                metrics["validation_loss"] = validation_loss

            wandb.log(metrics, step=epoch)

        # Save code as an artifact
        artifact = wandb.Artifact("nn_training_code", type="code")
        artifact.add_file("(Re)Introduction_to_Neural_Networks.ipynb")  # Replace with your script's filename
        wandb.log_artifact(artifact)
    finally:
        wandb.finish()


In [10]:
# Example data
dataset_x = np.random.rand(100, 3)  # 100 samples, 3 features
dataset_y = np.random.rand(100)    # 100 target values

# Define activation functions
def activation_function(x: float) -> float:
    return 1 / (1 + np.exp(-x))

def activation_function_deriv(x: float) -> float:
    sigmoid = activation_function(x)
    return sigmoid * (1 - sigmoid)

# Named `network` rather than `nn`: a global `nn` would replace the
# `torch.nn as nn` alias and break the model cells if they are re-run.
network = NeuralNetwork(input_size=3, act_func=activation_function, act_func_deriv=activation_function_deriv)

train_nn_with_logging(network, dataset_x, dataset_y, alpha=0.01, epochs=50)


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
training_loss,█▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
validation_loss,█▄▂▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄

0,1
training_loss,0.08444
validation_loss,0.12511


In [None]:
def plot_decision_boundary(nn: NeuralNetwork, X: np.array, Y: np.array):
    """Plot the network's two decision regions together with the data points.

    Args:
        nn: Callable network, evaluated on one 2-D point at a time.
        X: Samples with the two features in columns 0 and 1.
        Y: Values used to colour the samples.
    """
    margin, step = 0.5, 0.1

    # Grid covering the data range plus a margin on every side.
    x_axis = np.arange(X[:, 0].min() - margin, X[:, 0].max() + margin, step)
    y_axis = np.arange(X[:, 1].min() - margin, X[:, 1].max() + margin, step)
    xx, yy = np.meshgrid(x_axis, y_axis)

    grid = np.column_stack((xx.ravel(), yy.ravel()))
    outputs = [nn(np.array(point)) for point in grid]
    predictions = np.array(outputs).reshape(xx.shape)

    # Two filled regions split at an output of 0.5, with the data on top.
    plt.contourf(xx, yy, predictions, levels=[0, 0.5, 1], cmap="coolwarm", alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolor="k", cmap="coolwarm", s=100)
    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.title("Decision Boundary of Neural Network for XOR Problem")
    plt.show()


In [None]:
# XOR truth table: four 2-D inputs and their targets.
dataset_xor_x = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
dataset_xor_y = np.array([0, 1, 1, 0])

# Despite its name, `neuron` holds the whole three-neuron network.
neuron = NeuralNetwork(input_size=2, act_func=activation_function, act_func_deriv=activation_function_deriv)

alpha = 0.01
iterations = 10000  # not used by the calls below
epochs = 10000  # passes over the four samples made by train_nn

# Train sample by sample, then plot the resulting decision regions.
train_nn(neuron,dataset_xor_x, dataset_xor_y,  alpha, epochs)
plot_decision_boundary(neuron, dataset_xor_x, dataset_xor_y)