In [1]:
import numpy as np
import copy
# import matplotlib.pyplot as plt
import warnings

# Suppress RuntimeWarning
warnings.filterwarnings("ignore", category=RuntimeWarning)


class Node():
    def __init__(self):
        self.next_layer_nodes = {}
        self.inputs = {}
        self.activation = 0
        self.delta = 0
    
    def add_next_layer_node(self, node, weight):
        self.next_layer_nodes[node] = weight

    # def add

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def build_neural_network(layers, features, units, random_weights=True):
    # layers: number of layers
    # units: number of units in each layer
    neural_network = {}
    layer_node = []
    nn_next_layer_nodes = []

    for i in range(layers, -1, -1):
        neural_network[i] = []
        if i == layers:
            output_node = Node()
            neural_network[i].append(output_node)
            nn_next_layer_nodes.append(output_node)
        elif i > 0:
            for j in range(units + 1):
                node = Node()
                neural_network[i].append(node)
                for next_layer_node in nn_next_layer_nodes:
                    w = 0
                    if random_weights:
                        w = np.random.normal(size=1)[0]
                    node.next_layer_nodes[next_layer_node] = (w, 0)
                if j == 0:
                    node.activation = 1
                    continue
                layer_node.append(node)
            nn_next_layer_nodes = []
            for node in layer_node:
                nn_next_layer_nodes.append(node)
            layer_node.clear()
        else:
            for j in range(features + 1):
                node = Node()
                neural_network[i].append(node)
                for next_layer_node in nn_next_layer_nodes:
                    w = 0
                    if random_weights:
                        w = np.random.normal(size=1)[0]
                    node.next_layer_nodes[next_layer_node] = (w, 0)
                if j == 0:
                    node.activation = 1
    sorted_nn_dict = {k: neural_network[k] for k in sorted(neural_network)}

    return sorted_nn_dict

def forward_propagation(neural_network, inputs):
    #append 1 to the beginning of the inputs
    inputs = np.insert(inputs, 0, 1)

    for i in range(len(inputs)):
        neural_network[0][i].activation = inputs[i]

    for layer_idx, layer in neural_network.items():
        if layer_idx == 0:
            # print("input layer to hidden layer")
            input_layer_to_hidden(layer)
            # print("1::checking transfers to hidden layer")
            # for idx, node in enumerate(layer):
            #     print("     node: ", idx)
            #     for nl_node in node.next_layer_nodes:
            #         print("     next layer node: ", nl_node)
            #         print("     next layer node input at index: ", idx, " is: ", nl_node.inputs[idx])
        elif layer_idx == len(neural_network) - 1:
            # print("hidden to output layer")
            hidden_layer_to_output(layer)
        else:
            # print("hidden to hidden layer")
            hidden_layer_to_hidden(layer)
    # output = neural_network[len(neural_network) - 1][0].activation
    # print(output)
    return neural_network

def input_layer_to_hidden(layer):

    for idx, node in enumerate(layer):

        for nl_node in node.next_layer_nodes:
            input = node.activation
            weight = node.next_layer_nodes[nl_node][0]
            output = input * weight
            nl_node.inputs[idx] = output
            nl_node.activation = output

def hidden_layer_to_hidden(layer):
    for idx, node in enumerate(layer):
        # print("     node: ", idx)
        activation = 0
        if idx != 0:
            sigmoid_input = 0
            for i in node.inputs:
                sigmoid_input += node.inputs[i]
            # print("     sigmoid_input: ", sigmoid_input)
            activation = sigmoid(sigmoid_input)
        else:
            activation = 1
        node.activation = activation
        # print("     activation: ", activation)

        for nl_node in node.next_layer_nodes:
            w = node.next_layer_nodes[nl_node][0]
            output = activation * w
            nl_node.inputs[idx] = output

def hidden_layer_to_output(layer):
    node = layer[0]
    output = 0
    for i in node.inputs:
        output += node.inputs[i]
    node.activation = output
    # print("output", output)

def back_propagation_deltas(neural_network, target):
    for layer_idx in range(len(neural_network) - 1, -1, -1):
        # print("layer: ", layer_idx)
        layer = neural_network[layer_idx]
        if layer_idx == len(neural_network) - 1:
            output_node = layer[0]
            output_node.delta = output_node.activation - target
            # print("output_node.delta: ", output_node.delta)
        elif layer_idx == 0:
            pass
        else:
            for node in layer:
                delta = 0
                for nl_node in node.next_layer_nodes:
                    nl_node_delta = nl_node.delta
                    w = node.next_layer_nodes[nl_node][0]
                    delta += nl_node_delta * w * node.activation * (1 - node.activation)
                node.delta = delta
                # print("node.delta: ", node.delta)
    return neural_network
                    
def compute_gradients(neural_network):
    for layer_idx, layer in neural_network.items():
        # print("layer: ", layer_idx)
        if layer_idx == len(neural_network) - 1:
            continue
        else:
            for idx, node in enumerate(layer):
                # print("node: ", idx)
                for nl_node in node.next_layer_nodes:
                    nl_node_delta = nl_node.delta
                    partial_derivative = nl_node_delta * node.activation
                    # print("partial_derivative: ", partial_derivative)
                    node.next_layer_nodes[nl_node] = (node.next_layer_nodes[nl_node][0], partial_derivative)
    return neural_network

def update_weights(neural_network, learning_rate):
    for layer_idx, layer in neural_network.items():
        if layer_idx == len(neural_network) - 1:
            continue
        else:
            for node in layer:
                for nl_node in node.next_layer_nodes:
                    w = node.next_layer_nodes[nl_node][0]
                    partial_derivative = node.next_layer_nodes[nl_node][1]
                    w = w - learning_rate * partial_derivative
                    node.next_layer_nodes[nl_node] = (w, partial_derivative)
    return neural_network
        
def compute_error(neural_network, X, y):
    error = 0
    for i in range(len(X)):
        neural_network = forward_propagation(neural_network, X[i])
        output_node = neural_network[len(neural_network) - 1][0]
        error += (output_node.activation - y[i]) ** 2
    return error / len(X)



def train_neural_network(neural_network, X, y, learning_rate, d, epochs, tolerance=0.001):
    error = compute_error(neural_network, X, y)
    convergence = False
    iterations = 0
    objective_function_values = []
    objective_function_values.append(error)
    # print("initial error: ", error)

    while not convergence and iterations < epochs:
        # shuffle the data
        shuffle_indices = np.random.permutation(len(X))
        X = X[shuffle_indices]
        y = y[shuffle_indices]
        for j in range(len(X)):
            neural_network = forward_propagation(neural_network, X[j])
            neural_network = back_propagation_deltas(neural_network, y[j])
            neural_network = compute_gradients(neural_network)
            neural_network = update_weights(neural_network, learning_rate)
        current_error = compute_error(neural_network, X, y)
        # print("epoch: ", iterations, " error: ", current_error)
        error_difference = abs(current_error - error)
        if error_difference < tolerance:
            break
        error = current_error
        objective_function_values.append(error)
        iterations += 1
    # plt.plot(objective_function_values)
    # plt.ylabel('objective function value')
    # plt.xlabel('epoch')
    # plt.show()
    return neural_network

        


def predict(neural_network, X):
    predictions = []
    for i in range(len(X)):
        neural_network = forward_propagation(neural_network, X[i])
        output_node = neural_network[len(neural_network) - 1][0]
        predictions.append(output_node.activation)
    return predictions



In [3]:
# neural_network_test = build_neural_network(layers=3, features=2, units=2)

# layer_0_weights = {0: [-1, 1], 1: [-2, 2], 2: [-3, 3]}
# layer_1_weights = {0: [-1, 1], 1: [-2, 2], 2: [-3, 3]}
# layer_2_weights = {0: [-1], 1: [2], 2: [-1.5]}

# new_weights = {0: layer_0_weights, 1: layer_1_weights, 2: layer_2_weights}


# for layer_idx, layer in neural_network_test.items():   
#     for node_idx, node in enumerate(layer):
#         i = 0
#         for nl_node_idx, nl_node in node.next_layer_nodes.items():
#             new_w = new_weights[layer_idx][node_idx][i]
#             node.next_layer_nodes[nl_node_idx] = (new_w, 0)
#             i += 1

# # for layer_idx, layer in neural_network_test.items():
# #     print("layer: ", layer_idx)
# #     for node_idx, node in enumerate(layer):
# #         print("weights")
# #         print("node: ", node_idx)
# #         for nl_node in node.next_layer_nodes:
# #             print(node.next_layer_nodes[nl_node][0])
# x = np.array([[1, 1]])
# y = np.array([-1])

# print("len of x: ", len(x[0])) 

# neural_network_test = train_neural_network(neural_network_test, x, y, learning_rate=0.1, d=1, epochs=100, tolerance=0.00001)
# predictions = predict(neural_network_test, x)
# print(predictions)


# # neural_network_test = forward_propagation(neural_network_test, x)
# # output = neural_network_test[len(neural_network_test) - 1][0].activation
# # print(output)


In [4]:
delta = 0.09182
z =[1, 0.00247, 0.997752]
for i in range(len(z)):
    partial_derivative = delta * z[i]
    print(partial_derivative)

0.09182
0.0002267954
0.09161358864


(b) [17 points] Implement the stochastic gradient descent algorithm to learn the neu-
ral netowrk from the training data. Use the schedule of learning rate: γt = γ0
1+γ0d t.
Initialize the edge weights with random numbers generated from the standard
Gaussian distribution. We restrict the width, i.e., the number of nodes, of
each hidden layer (i.e., Layer 1 & 2 ) to be identical. Vary the width from
{5,10,25,50,100}. Please tune γ0 and d to ensure convergence. Use the curve
of the objective function (along with the number of updates) to diagnosis the
convergence. Don’t forget to shuffle the training examples at the start of each
epoch. Report the training and test error for each setting of the width.



In [2]:

X_training = np.genfromtxt('../data/bank-note/train.csv', delimiter=',')
X_test = np.genfromtxt('../data/bank-note/test.csv', delimiter=',')

y_training = X_training[:, -1]
y_test = X_test[:, -1]

y_training[y_training == 0] = -1
y_test[y_test == 0] = -1

# X_training = np.insert(X_training, 0, 1, axis=1)
# X_test = np.insert(X_test, 0, 1, axis=1)

X_training = X_training[:, :-1]
X_test = X_test[:, :-1]

units = [5,10,25,50,100]
layers = 3
features = 4
learning_rate = 0.1
epochs = 10
tolerance = 0.0001
d=1

for unit in units:
    neural_network = build_neural_network(layers=layers, features=features, units=unit, random_weights=True)
    train_neural_network(neural_network, X_training, y_training, learning_rate=learning_rate, d=d, epochs=epochs, tolerance=tolerance)
    train_predictions = predict(neural_network, X_training)
    test_predictions = predict(neural_network, X_test)
    train_error = 0
    test_error = 0
    for i in range(len(train_predictions)):
        if train_predictions[i] > 0.9:
            train_predictions[i] = 1
        else:
            train_predictions[i] = -1
        if train_predictions[i] != y_training[i]:
            train_error += 1
    for i in range(len(test_predictions)):
        if test_predictions[i] > 0.9:
            test_predictions[i] = 1
        else:
            test_predictions[i] = -1
        if test_predictions[i] != y_test[i]:
            test_error += 1
    print("width: ", unit)
    print("train error: ", train_error / len(train_predictions))
    print("test error: ", test_error / len(test_predictions))
    # print("train predictions: ", train_predictions)




width:  5
train error:  0.009174311926605505
test error:  0.012
width:  10
train error:  0.04472477064220184
test error:  0.052
width:  25
train error:  0.052752293577981654
test error:  0.058
width:  50
train error:  0.02981651376146789
test error:  0.028
width:  100
train error:  0.4461009174311927
test error:  0.442


(c) [10 points]. Now initialize all the weights with 0, and run your training algorithm
again. What is your training and test error? What do you observe and conclude?


In [2]:

X_training = np.genfromtxt('../data/bank-note/train.csv', delimiter=',')
X_test = np.genfromtxt('../data/bank-note/test.csv', delimiter=',')

y_training = X_training[:, -1]
y_test = X_test[:, -1]

y_training[y_training == 0] = -1
y_test[y_test == 0] = -1


X_training = X_training[:, :-1]
X_test = X_test[:, :-1]

units = [5,10,25,50,100]
layers = 3
features = 4
learning_rate = 0.1
epochs = 10
tolerance = 0.0001
d=1

for unit in units:
    neural_network = build_neural_network(layers=layers, features=features, units=unit, random_weights=False)
    train_neural_network(neural_network, X_training, y_training, learning_rate=learning_rate, d=d, epochs=epochs, tolerance=tolerance)
    train_predictions = predict(neural_network, X_training)
    test_predictions = predict(neural_network, X_test)
    train_error = 0
    test_error = 0
    for i in range(len(train_predictions)):
        if train_predictions[i] > 0.9:
            train_predictions[i] = 1
        else:
            train_predictions[i] = -1
        if train_predictions[i] != y_training[i]:
            train_error += 1
    for i in range(len(test_predictions)):
        if test_predictions[i] > 0.9:
            test_predictions[i] = 1
        else:
            test_predictions[i] = -1
        if test_predictions[i] != y_test[i]:
            test_error += 1
    print("width: ", unit)
    print("train error: ", train_error / len(train_predictions))
    print("test error: ", test_error / len(test_predictions))

width:  5
train error:  0.04472477064220184
test error:  0.062
width:  10
train error:  0.14564220183486237
test error:  0.166
width:  25
train error:  0.1926605504587156
test error:  0.206
width:  50
train error:  0.1651376146788991
test error:  0.186
width:  100
train error:  0.34059633027522934
test error:  0.338


(d) [6 points]. As compared with the performance of SVM (and the logistic regression
you chose to implement it; see Problem 3), what do you conclude (empirically)
about the neural network?

The performance of the neural network was very similar to the SVM, but the seems to have performed better if the right about of nodes in the hidden layer was chosen. 

[30 points] Please use PyTorch (or TensorFlow if you want) to fulfill
the neural network training and prediction. Please try two activation functions,
“tanh” and “RELU”. For “tanh”, please use the “Xavier’ initialization; and for
“RELU”, please use the “he” initialization. You can implement these initializa-
tions by yourselves or use PyTorch (or TensorFlow) library. Vary the depth from
{3,5,9} and width from {5,10,25,50,100}. Pleas use the Adam optimizer for
training. The default settings of Adam should be sufficient (e.g., initial learning
rate is set to 10−3). Report the training and test error with each (depth, width)
combination. What do you observe and conclude? Note that, we won’t provide
any link or manual for you to work on this bonus problem. It is YOUR JOB
to search the documentation, find code snippets, test, and debug with PyTorch
(or TensorFlow) to ensure the correct usage. This is what all machine learning
practitioners do in practice.

 install PyTorch in a specific Python environment

In [11]:
import torch
from torch import nn
from torch import optim
import numpy as np
 
class Network(nn.Module):
    def __init__(self, depth, width, input_dim, output_dim, activation):
        super().__init__()
        self.layers = nn.ModuleList()
        self.activation = None
        for i in range(depth):
            if i == 0:
                self.layers.append(nn.Linear(input_dim, width))
            else:
                self.layers.append(nn.Linear(width, width))
        self.layers.append(nn.Linear(width, output_dim))
        
        if activation == 'tanh':
            self.activation = nn.Tanh()
            for layer in self.layers:
                nn.init.xavier_uniform_(layer.weight)
                nn.init.ones_(layer.bias)
        elif activation == 'relu':
            self.activation = nn.ReLU()
            for layer in self.layers:
                nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')
                nn.init.ones_(layer.bias)
        else:
            raise ValueError('Invalid activation function')
        
    def forward(self, x):
        for layer in self.layers:
            x = self.activation(layer(x))
            # if layer != self.layers[-1]:
            # x = self.activation(x)
        return x


X_training = np.genfromtxt('../data/bank-note/train.csv', delimiter=',')
X_test = np.genfromtxt('../data/bank-note/test.csv', delimiter=',')

y_training = X_training[:, -1]
y_test = X_test[:, -1]


# X_training = np.insert(X_training, 0, 1, axis=1)
# X_test = np.insert(X_test, 0, 1, axis=1)

X_training = X_training[:, :-1]
X_test = X_test[:, :-1]

# Convert numpy arrays to PyTorch tensors
X_training_torch = torch.from_numpy(X_training).float()
y_training_torch = torch.from_numpy(y_training).float().view(-1, 1)
X_test_torch = torch.from_numpy(X_test).float()
y_test_torch = torch.from_numpy(y_test).float().view(-1, 1)

depths = [3,5,9]
widths = [5,10,25, 50, 100]
activations = ['tanh', 'relu']


for depth in depths:
    for width in widths:
        for activation in activations:
            model = Network(depth, width, 4, 1, activation)
            optimizer = optim.Adam(model.parameters())
            loss_fn = nn.BCEWithLogitsLoss()

            for epoch in range(10): 
                optimizer.zero_grad()
                output = model(X_training_torch)
                loss = loss_fn(output, y_training_torch)
                loss.backward()
                optimizer.step()


            with torch.inference_mode():
                print("Test Error")
                output_test = model(X_test_torch)
                predictions_test = torch.round(torch.sigmoid(output_test))
                error_test = (predictions_test != y_test_torch).float().mean()
                print(f'Depth: {depth}, Width: {width}, Activation: {activation}, Error: {error_test.item()}')

                print("Train Error")
                output_training = model(X_training_torch)
                predictions_training = torch.round(torch.sigmoid(output_training))
                error_training = (predictions_training != y_training_torch).float().mean()
                print(f'Depth: {depth}, Width: {width}, Activation: {activation}, Error: {error_training.item()}')
                print("\n")

Test Error
Depth: 3, Width: 5, Activation: tanh, Error: 0.5580000281333923
Train Error
Depth: 3, Width: 5, Activation: tanh, Error: 0.5538991093635559


Test Error
Depth: 3, Width: 5, Activation: relu, Error: 0.5580000281333923
Train Error
Depth: 3, Width: 5, Activation: relu, Error: 0.5538991093635559


Test Error
Depth: 3, Width: 10, Activation: tanh, Error: 0.5580000281333923
Train Error
Depth: 3, Width: 10, Activation: tanh, Error: 0.5538991093635559


Test Error
Depth: 3, Width: 10, Activation: relu, Error: 0.5580000281333923
Train Error
Depth: 3, Width: 10, Activation: relu, Error: 0.5538991093635559


Test Error
Depth: 3, Width: 25, Activation: tanh, Error: 0.2800000011920929
Train Error
Depth: 3, Width: 25, Activation: tanh, Error: 0.2958715558052063


Test Error
Depth: 3, Width: 25, Activation: relu, Error: 0.49399998784065247
Train Error
Depth: 3, Width: 25, Activation: relu, Error: 0.5034403800964355


Test Error
Depth: 3, Width: 50, Activation: tanh, Error: 0.23800000548362

[Bonus] [30 points] We will implement the logistic regression model with stochastic
gradient descent. We will use the dataset “bank-note.zip” in Canvas. Set the maximum
number of epochs T to 100. Don’t forget to shuffle the training examples at the start of
each epoch. Use the curve of the objective function (along with the number of updates)
to diagnosis the convergence. We initialize all the model parameters with 0.