In [6]:
from neural_new import *
np.random.seed(1826)

from itertools import product

cancer = load_breast_cancer()

data = cancer.data
target = cancer.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2)

# Scale data
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.fit_transform(X_test)

We will now begin a broad test to find the most optimal parameters w.r.t. accuracy, hence the cost-function in this case will be the cross-entropy cost function.
The following is simply a collection of lists and arrays which contain parameter options which will be tested for.

In [2]:
weight_initialization = ['he', 'xavier', 'none', 'homemade'] # None => normally distributed
activation_functions = ['sigmoid', 'relu', 'leaky_relu']
learning_rates = [10**n for n in range(-6, 1)]
batch_sizes = [2**n for n in range(5)]

layer_sizes = np.arange(10, 40+1, 10)
max_layers = 1

layers = []
for length in range(1, max_layers + 1):
    l = []
    for elem in product(layer_sizes, repeat = length):
        l.append(list(elem))

    layers.append(l)

epochs = np.floor(np.linspace(1e3, 1e4, 6)).astype(int)

lambdas = np.logspace(-4, 1, 6)
cost_function = 'accuracy'
method = 'classification'

In [3]:
shape = [84] #  Number of hidden layer and node combinations tested
shape += [len(weight_initialization), len(activation_functions), len(learning_rates),\
         len(batch_sizes), len(epochs), len(lambdas)]

print(f"Dimensionality of matrix: {tuple(shape)}")
prod = 1
for elem in shape:
    prod *= elem
print(f"Total elements: {prod}, same memory allocation as a square {np.ceil(np.sqrt(prod))}-dim matrix (approx)")

Dimensionality of matrix: (84, 4, 3, 7, 5, 6, 6)
Total elements: 1270080, same memory allocation as a square 1127.0-dim matrix (approx)


Our monsterous for-loop may now commence.

It was at this moment (21:26) Daniel realized that training a Neural Network 1270080 times would be quite unfeasable. The operation was stopped and after some minor changes and he went home at 22:02.

In [4]:
'''
acc_scores = np.zeros(tuple(shape))

for i in range(3):
    for hl_i, hl in enumerate(layers[i]):
        for wi_i, wi in enumerate(weight_initialization):
            for af_i, af in enumerate(activation_functions):
                for eta_i, eta in enumerate(learning_rates):
                    for bs_i, bs in enumerate(batch_sizes):
                        for ep_i, ep in enumerate(epochs):
                            for l_i, lambd in enumerate(lambdas):
                                
                                if i == 1:
                                    hl_i = 4 + hl_i
                                elif i == 2:
                                    hl_i = 20 + hl_i
                                    
                                print(f"{hl_i}/{shape[0]}, {wi_i}/{shape[1]}, {af_i}/{shape[2]}, {eta_i}/{shape[3]}, {bs_i}/{shape[4]}, {ep_i}/{shape[5]}, {l_i}/{shape[6]}")
                                    
                                network = NeuralNetwork(X_train_s, y_train, X_test_s, y_test,\
                                                       hl, ep, bs, eta, lambd, af,\
                                                       cost_func='accuracy', dataset='classification',\
                                                       weight_init_method=wi)
                                
                                network.model_training("SGD", plot="no")
                                
                                pred = network.prediction(X_test_s)

                                print(pred)
                                print(pred.round())
                                print(pred.shape)
                                print(y_test.reshape(-1, 1).shape)
                                test_score = np.sum(pred.round() == y_test.reshape(1, -1)) / len(y_test)
                                print(f"Accuracy: {np.sum(pred.round() == y_test)}/{len(y_test)} = {test_score}")
                                
                                acc_scores[hl_i, wi_i, af_i, eta_i, bs_i, ep_i, l_i] = test_score
print("Matrix complete.")
'''       

0/84, 0/4, 0/3, 0/7, 0/5, 0/6, 0/6


KeyboardInterrupt: 

In the end we wish to find the optimal learning rate and hyperparameter $\lambda$, so lets find a proper combination of the other parameters first. This way we ensure that the the optimal $\eta$-$\lambda$ combination will be a good approximation to the true optimal parameter-combination.

Starting with the optimal layer and node combination, keeping every other parameter constant. 

We choose the sigmoid function as the activiation function and xavier as the weight initialization. $\eta$ = 0.01, $\lambda$ = 0, batch size = 1 and epochs = 1000.

Starting with a fixed amount of nodes in each layer, 20.

In [17]:
hl = [[30], [30, 30], [30, 30, 30]]
trained_models = []
for layer in hl:
    network = NeuralNetwork(X_train_s, y_train, X_test_s, y_test,\
                            layer, 1000, 1, 0.01, 0, 'sigmoid',\
                            cost_func='accuracy', dataset='classification',\
                            weight_init_method='xavier')
    print(f"Initialized with layerstructure: {layer}, commencing training")
    network.model_training("SGD", plot='no')
    trained_models.append(network)

Initialized with layerstructure: [30], commencing training
Initialized with layerstructure: [30, 30], commencing training
Initialized with layerstructure: [30, 30, 30], commencing training


In [23]:
probabilities = []
pred = []
accs = []
target = y_test.reshape(-1, 1)
print(target.shape)
for i in range(len(hl)):
    #print(trained_models[i].hidden_nodes)
    probabilities.append(trained_models[i].prediction(X_test_s))
    pred.append(probabilities[i].round())
    accs.append(np.sum(pred[i] == target))

print(accs)
#trained_models[0].prediction(X_test_s)
#pred = probabilities.round()
#print(pred)
#print(sum(pred))

(114, 1)
[112, 112, 112]


Seeing as there is no difference when it comes to the number of layers, the hiddel layer structure [30, 30] will be used henceforth. That is two hidden layers each containing 30 nodes. 

Lets now test the other combination 