# Imports

In [200]:
import math

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from numpy.random import default_rng
rng = default_rng()

# Question 3: Neural Networks

## Part A: Custom Implementation

Let's begin with the dataset given in the PDF:

In [2]:
feat1 = np.array([4, 4, 1, 2.5, 4.9, 1.9, 3.5, 0.5, 2, 4.5])
feat2 = np.array([2.9, 4, 2.5, 1, 4.5, 1.9, 4, 1.5, 2.1, 2.5])
labels = np.array([1, 1, -1, -1, 1, -1, 1, -1, -1, 1])
df = pd.DataFrame(data=dict({
    "x1": feat1,
    "x2": feat2,
    "y_true": labels 
}))
df.head(10)

Unnamed: 0,x1,x2,y_true
0,4.0,2.9,1
1,4.0,4.0,1
2,1.0,2.5,-1
3,2.5,1.0,-1
4,4.9,4.5,1
5,1.9,1.9,-1
6,3.5,4.0,1
7,0.5,1.5,-1
8,2.0,2.1,-1
9,4.5,2.5,1


Now's the time to make this model:

In [205]:
from dataclasses import dataclass


def sigmoid(x: np.array) -> np.array:
    """
    Given an array of activation values, we return 
    an array of probabilities between 0-1. 

    They will NOT necessarily add up to 1.

    This is mainly intended for classification problems.
    """
    return 1.0 / (1.0 + np.exp(-x))


@dataclass
class BinaryClassificationMLP:
    num_layers: int
    units_per_layer: np.ndarray
    num_features: int = None
    threshold: float = 0.5
    classes: np.ndarray = np.array([0, 1])

    def define_model(self, num_features: int) -> None:
        '''Glorot weight initialization'''
        self.layers = list()
        fan_in = self.num_features
        fan_out_index = 0
        for layer_index in range(self.num_layers):
            # initialize using a randomly sampled uniform distribution
            fan_out = self.units_per_layer[fan_out_index]
            scale = max(1.0, (fan_in + fan_out) / 2.0)
            limit = math.sqrt(3.0 * scale)
            layer_weights = rng.uniform(low=-limit, high=limit, size=(fan_in, fan_out))
            # biases - initialize to zeros, b/c we don't need to break symmetry (unlike for the weights)
            layer_bias = np.zeros((fan_out, 1))
            # activation - use tanh for hidden layers, and sigmoid for the last one
            activation = np.tanh
            if layer_index == self.num_layers - 1:
                activation = sigmoid
            # add to the list, and prep for next iteration
            self.layers.append((layer_weights, layer_bias, activation))
            fan_in = fan_out
            fan_out_index += 1

    def forward(self, X):
        current_activation = X
        layer_activations = list()
        for weight, bias, act_func in self.layers:
            layer_summation = current_activation @ weight + bias.T
            current_activation = act_func(layer_summation)
            layer_activations.append(current_activation)
        return layer_activations

    def backward(self, X, y, learning_rate, activations):
        # variables we're going to need
        weights1, bias1, act1 = self.layers[0]
        weights2, bias2, act2 = self.layers[1]
        per_sample_factor = (1 / X.shape[0])
        hidden_layer_activation, output_layer_activations = (
            activations[0], activations[1]
        )
        output_layer_weights = weights2
        num_samples = X.shape[0]

        # derivatives for the output layer
        y_pred = output_layer_activations
        y_true = np.where(y == -1, 0, 1).reshape(-1, 1)  # labels should be only 0/1
        error = y_pred - y_true
        derivative_y_pred = y_pred * (1 - y_pred)
        grad_output_layer = error * derivative_y_pred
        derivative_output_layer = dW2 = (grad_output_layer.T @ hidden_layer_activation).T
        derivative_output_bias = db2 = (1 / num_samples) * np.sum(error, axis=1, keepdims=True)

        # update weights in output layer before going fwd
        new_output_weights = weights2 - learning_rate * dW2
        new_output_bias = bias2 - learning_rate * bias2

        # derivatives for the hidden layer
        derivative_hidden_activation = z_prime = dZ1 = hidden_layer_activation * (1 - hidden_layer_activation)
        grad_hidden_layer = grad_output_layer @ new_output_weights.T * z_prime
        derivative_hidden_weights = X.T @ grad_hidden_layer
        derivative_hidden_bias = db1 = (1 / num_samples) * np.sum(z_prime, axis=1, keepdims=True)

        # update weights in the hidden layer
        new_hidden_weights = weights1 - learning_rate * derivative_hidden_weights
        new_hidden_bias = bias1 - learning_rate * bias1

        # update the state of the model
        self.layers[0] = (new_hidden_weights, new_hidden_bias, act1)
        self.layers[1] = (new_output_weights, new_output_bias, act2)

    def fit(self, X_train: np.array, y_train: np.array,
            epochs=1000, learning_rate=0.0001) -> None:

        # A: initial state of the network
        num_features = X_train.shape[1]
        self.define_model(num_features)

        layer1, layer2 = self.layers[0], self.layers[1]

        # B: training!
        for _ in range(epochs):
            activations = self.forward(X_train)
            self.backward(X_train, y_train, learning_rate, activations)

    def predict(self, X) -> np.ndarray:
        activations = self.forward(X)
        class1, class2 = self.classes
        y_pred = np.where(activations[1] >= self.threshold, class2, class1)
        return y_pred


Let's apply this model using our specific hyperparams:

In [221]:
num_layers = 2
units_per_layer = [5, 1]
num_features = 2
classes = np.array([-1, 1])
custom_mlp = BinaryClassificationMLP(
    num_layers, units_per_layer, num_features, classes=classes
)

Getting the data is half the fun:

In [222]:
X, y = df[["x1", "x2"]].values, df["y_true"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Training time!

In [223]:
custom_mlp.fit(X_train_scaled, y_train, epochs=300)

In [224]:
accuracy1 = accuracy_score(y_test, custom_mlp.predict(X_test_scaled))
print(f"Accuracy: {accuracy1 * 100}%")

Accuracy: 0.0%


## Part B: `sklearn` Implementation

In [225]:
from sklearn.neural_network import MLPClassifier

In [227]:
sklearn_mlp = MLPClassifier(random_state=42, max_iter=300).fit(X_train_scaled, y_train)
accuracy2 = sklearn_mlp.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy2 * 100}%")

Accuracy: 100.0%




## Part C: A New Classification

In [229]:
X_new = np.array([3, 3]).reshape(1, -1)
y_preds = [model.predict(X_new).squeeze() for model in [custom_mlp, sklearn_mlp]]

print(f"Prediction from Custom: {y_preds[0]}")
print(f"Prediction from Scikit-learn: {y_preds[1]}")

Prediction from Custom: -1
Prediction from Scikit-learn: 1
