# RFA-based NN growth experiment (WIP)
# TODO

In [None]:
import torch


# Toy setup: a fixed two-layer ReLU network evaluated on three 2-D points,
# plus a zero-initialised square "dummy" matrix whose gradient is inspected
# after the backward pass.
input_dimension = 2
hidden_units = 3

layers = [
    [[1, 0, -1], [0, 1, 0]],  # input -> hidden weights (2 x 3)
    [1, -1, 1],               # hidden -> output weights (3,)
]

dummy_variable = torch.zeros((hidden_units, hidden_units), requires_grad=True)
dummy_variable.retain_grad()


x = torch.tensor([[1., 0.], [0., 1.], [-1., 0.]])
y = torch.tensor([1., 0., 1.])

# Integer weight lists are converted to the same floating dtype as the inputs.
input_layer = torch.tensor(layers[0], dtype=x.dtype)
output_layer = torch.tensor(layers[1], dtype=x.dtype)


activations = torch.relu(x @ input_layer)

# Per-sample quadratic form a_i^T D a_i, with D = dummy_variable.
projected = activations.mm(dummy_variable).reshape(-1, hidden_units, 1)
dummy_term = torch.bmm(activations.unsqueeze(1), projected).reshape(-1)

pred = activations @ output_layer + dummy_term
pred.retain_grad()

import numpy, torch

from experiment import TwoLayerNeuralNet
from settings.noisy_xor import get_dataloader
from utils.optimization import Accuracy, initialize

# NOTE(review): `experiment_results` is not defined in any visible cell; it
# must already exist in the notebook namespace for this cell to run.
# `initialize` is a project helper called with the configured seed
# (presumably seeds the random number generators — confirm).
initialize(experiment_results['seed'])

# `get_dataloader` returns the data iterable together with a rotation matrix.
data, rotation_matrix = get_dataloader(**experiment_results)
# Take the first item yielded by `data`; the enumerate index is discarded.
_, (inputs, labels) = next(enumerate(data))

# Binary cross-entropy on raw logits, with the default reduction.
loss_fn = torch.nn.BCEWithLogitsLoss()
loss = loss_fn(pred, y)

loss.backward()
# Gradient of the loss with respect to the dummy matrix.
dummy_variable.grad
# Per-sample outer products a_i a_i^T, shape (N, hidden_units, hidden_units).
# The result is not stored; only the last expression of a cell is displayed.
activations.reshape(-1, hidden_units, 1).bmm(activations.reshape(-1, 1, hidden_units))
# Squared norm of the scaled gradient matrix.
# NOTE(review): `max_eigenvalue` is not defined in any visible cell.
numpy.linalg.norm((dummy_variable.grad / max_eigenvalue).detach().cpu().numpy()) ** 2

In [6]:
import torch

# Small 2 x 3 integer tensor used to try out indexing.
a = torch.tensor([[1, 2, 3], [4, 5, 6]])

In [11]:
# `...` keeps every leading axis; `:-1` drops the last entry along the final
# axis, so the 2 x 3 tensor becomes 2 x 2 (see the recorded output).
a[..., :-1]

tensor([[1, 2],
        [4, 5]])

In [None]:
from experiment import TwoLayerNeuralNet
from settings.noisy_xor import get_dataloader
from utils.optimization import Accuracy, initialize

# NOTE(review): `experiment_results` is not defined in any visible cell; it
# must already exist in the notebook namespace for this cell to run.
# `initialize` is a project helper called with the configured seed
# (presumably seeds the random number generators — confirm).
initialize(experiment_results['seed'])

# Data iterable from the `settings.noisy_xor` module, plus a rotation matrix.
data, rotation_matrix = get_dataloader(**experiment_results)
# Take the first item yielded by `data`; the enumerate index is discarded.
_, (inputs, labels) = next(enumerate(data))

# Load a stored model from ./models/ using the experiment settings with
# 'run' set to 0.
model = TwoLayerNeuralNet.load('./models/', {**experiment_results, 'run': 0})

# Score the loaded model on that first batch only.
print(f'Accuracy: {Accuracy(model(inputs), labels)}')

In [None]:
import numpy, torch

input_dimension = experiment_results['input_dimension']

# Five hand-picked 2-D points, zero-padded up to `input_dimension` columns,
# then multiplied by `rotation_matrix` and converted to a float tensor.
x = numpy.array([[1., 0.], [0., 1.], [-1., 0.], [0., -1.], [1., -1.]])
zero_padding = numpy.zeros((len(x), input_dimension - 2))
x = numpy.concatenate([x, zero_padding], axis=1)
x = torch.tensor(numpy.matmul(x, rotation_matrix), dtype=torch.float)

# One target per point, shaped (5, 1).
y = torch.tensor([0., 1., 0., 1., 1.]).unsqueeze(1)

# reduction='none' keeps one loss value per sample instead of averaging.
loss_fn = torch.nn.BCEWithLogitsLoss(reduction='none')
pred = model(x)
loss = loss_fn(pred, y)
#loss.backward()

In [None]:
import matplotlib.pyplot

# Bin count passed to `torch.histogram` for every per-cluster histogram below.
histogram_resolution = 20

def histogram_bars(histogram_frequencies, histogram_bins):
    """Turn a ``torch.histogram`` result into arguments for ``Axes.bar``.

    Args:
        histogram_frequencies: tensor of per-bin counts.
        histogram_bins: tensor of bin edges (one more entry than counts, as
            returned by ``torch.histogram``). It is left unmodified.

    Returns:
        A ``(positions, heights, width)`` tuple: ``positions`` are the left
        bin edges shifted right by half the bar width, ``heights`` are the
        counts divided by their sum, and ``width`` is the width of the first
        bin capped at 0.01.
    """
    bin_edges = histogram_bins.detach().cpu().numpy()
    # Measure the width on the full edge array so a one-bin histogram
    # (two edges) works too.
    histogram_bins_pace = min(bin_edges[1] - bin_edges[0], 0.01)
    # Build a new array instead of `+=`: `numpy()` shares memory with the
    # tensor, so an in-place add would modify the caller's bin edges.
    bar_positions = bin_edges[:-1] + histogram_bins_pace / 2.
    frequencies = histogram_frequencies.detach().cpu().numpy()
    return bar_positions, frequencies / frequencies.sum(), histogram_bins_pace

fig, (activations_ax, loss_ax, scatter_ax) = matplotlib.pyplot.subplots(1, 3, figsize=(18, 6))

# (axis, title, x label, y label) for each of the three panels.
for ax, panel_title, x_label, y_label in (
    (activations_ax, 'Activations', 'activation L2 norm', '% samples'),
    (loss_ax, 'Loss', 'loss norm', '% samples'),
    (scatter_ax, 'Activations vs Loss', 'activation L2 norm', 'loss'),
):
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

# Each distinct cluster id is paired, in order, with the 2-D point used as its
# legend label.
cluster_points = [[1., 0.], [0., 1.], [-1., 0.], [0., -1.]]
for cluster_index, cluster in zip(inputs_cluster.unique().tolist(), cluster_points):
    cluster_rows = data[inputs_cluster == cluster_index]
    # Column 0 feeds the "Activations" panel, column 1 the "Loss" panel.
    first_column = cluster_rows[:, 0]
    second_column = cluster_rows[:, 1]
    cluster_label = f'cluster {cluster}'

    activations_ax.bar(
        *histogram_bars(*torch.histogram(first_column, histogram_resolution)),
        label=cluster_label,
        alpha=0.3,
    )
    loss_ax.bar(
        *histogram_bars(*torch.histogram(second_column, histogram_resolution)),
        label=cluster_label,
        alpha=0.3,
    )
    scatter_ax.scatter(
        first_column.detach().numpy(),
        second_column.detach().numpy(),
        label=cluster_label,
    )

for ax in (activations_ax, loss_ax, scatter_ax):
    ax.legend()

fig

In [78]:
import torch


# Sizes of the toy problem in this cell: 2-D inputs and 2 hidden units,
# matching the 2 x 2 input weight matrix defined in the class below.
input_dimension = 2
hidden_units = 2

class TwoLayerNeuralNet(torch.nn.Module):
    """Toy two-layer ReLU network with an extra quadratic "dummy" term.

    For a batch ``x`` the prediction for sample ``i`` is
    ``a_i @ output_layer + a_i^T D a_i`` where ``a_i = relu(x_i @ input_layer)``
    and ``D`` is ``dummy_variable`` (initialised to zeros, so it does not
    change the prediction but does receive a gradient).

    Only ``input_layer`` is a registered parameter; ``output_layer`` and
    ``dummy_variable`` are plain tensors and are therefore not returned by
    ``parameters()``.

    The intermediate tensors of the latest forward pass are kept on the
    instance (``pre_activations``, ``activations``, ``pred``) with their
    gradients retained so they can be inspected after ``backward()``.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but ignored.
        super().__init__()
        self.input_layer = torch.nn.Parameter(torch.tensor([[1., 0.],
                                                            [0., 1.]]))
        self.activation_fn = torch.nn.ReLU()
        self.output_layer = torch.tensor([-1., 1.])
        # Size the dummy matrix from the weights themselves rather than from a
        # notebook-level global, so re-running other cells cannot desynchronise
        # the two. As a leaf tensor with requires_grad=True it keeps its
        # `.grad` without needing `retain_grad()`.
        hidden_units = self.input_layer.shape[1]
        self.dummy_variable = torch.zeros(hidden_units, hidden_units, requires_grad=True)

    def forward(self, x):
        """Return one logit per row of ``x`` (a 2-D tensor of inputs)."""
        self.pre_activations = x.mm(self.input_layer).requires_grad_()
        self.pre_activations.retain_grad()

        self.activations = self.activation_fn(self.pre_activations).requires_grad_()
        self.activations.retain_grad()

        # Per-sample quadratic form a_i^T D a_i, computed as a batched
        # (1 x h) @ (h x 1) product and flattened to shape (batch,).
        hidden_units = self.dummy_variable.shape[0]
        projected = self.activations.mm(self.dummy_variable).reshape(-1, hidden_units, 1)
        dummy_term = self.activations.unsqueeze(1).bmm(projected).reshape(-1)

        self.pred = torch.matmul(self.activations, self.output_layer) + dummy_term
        self.pred.retain_grad()
        return self.pred

model = TwoLayerNeuralNet()

# Snapshot both weight tensors as nested Python lists before any update.
initial_input_layer = model.input_layer.detach().cpu().tolist()
initial_output_layer = model.output_layer.detach().cpu().tolist()

optimizer = torch.optim.SGD(model.parameters(), lr=1.0)

# Five 2-D inputs with one binary target each.
x = torch.tensor([
    [1., 0.],
    [0., 1.],
    [-1., 0.],
    [0., -1.],
    [-1., -1.],
])
y = torch.tensor([0., 1., 0., 1., 1.])

# reduction='none' keeps one loss value per sample instead of averaging.
loss_fn = torch.nn.BCEWithLogitsLoss(reduction='none')
pred = model(x)
loss = loss_fn(pred, y)

# The optimisation step is currently disabled, so no weights are updated.
#optimizer.zero_grad()
#loss.backward()
#optimizer.step()

# Snapshot again for comparison with the initial values.
final_input_layer = model.input_layer.detach().cpu().tolist()
final_output_layer = model.output_layer.detach().cpu().tolist()

In [84]:
# Sanity check: per-element BCE-with-logits for zero logits against targets
# 0 and 1, without any reduction.
loss = torch.nn.functional.binary_cross_entropy_with_logits(
    torch.tensor([0., 0.]),
    torch.tensor([0., 1.]),
    reduction='none',
)

In [85]:
# Display the unreduced loss: one value per element. Both recorded values are
# 0.6931 (= ln 2), the loss of a zero logit for either target.
loss

tensor([0.6931, 0.6931])

In [86]:
# NOTE: `.item()` only works on single-element tensors. With reduction='none'
# `loss` holds one value per sample, hence the RuntimeError recorded below.
# Use `loss.tolist()` or reduce first (e.g. `loss.mean().item()`).
loss.item()

RuntimeError: a Tensor with 2 elements cannot be converted to Scalar

In [83]:
# Check which reduction the current `loss_fn` applies ('none' = per-sample).
loss_fn.reduction

'none'

In [79]:
# Display the loss; the recorded output has five entries and a grad_fn, i.e.
# one unreduced value per sample of a five-sample batch.
loss

tensor([0.3133, 0.3133, 0.6931, 0.6931, 0.6931],
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [76]:
# Display the loss; the recorded output is a scalar (0.5412), which equals the
# mean of the five per-sample values recorded above — presumably from an
# earlier run that used the default 'mean' reduction.
loss

tensor(0.5412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [68]:
# Input-layer weights captured before the (optional) optimisation step.
initial_input_layer

[[1.0, 0.0], [0.0, 1.0]]

In [69]:
# Input-layer weights captured after the (optional) optimisation step; the
# recorded value is identical to the initial one, i.e. no update took effect.
final_input_layer

[[1.0, 0.0], [0.0, 1.0]]

In [70]:
# Gradient w.r.t. the hidden pre-activations, summed over the batch axis.
# Needs a prior `loss.backward()`; in the model cell as shown that call is
# commented out, so the recorded output presumably comes from an earlier run.
model.pre_activations.grad.sum(axis=0)

tensor([0., 0.])

In [71]:
# Norm of the batch-summed pre-activation gradient (requires a prior
# `loss.backward()`).
model.pre_activations.grad.sum(axis=0).norm()

tensor(0.)

In [72]:
# Gradient w.r.t. the hidden activations (after ReLU), summed over the batch
# axis (requires a prior `loss.backward()`).
model.activations.grad.sum(axis=0)

tensor([-0.5000,  0.5000])

In [73]:
# Norm of the batch-summed activation gradient (requires a prior
# `loss.backward()`).
model.activations.grad.sum(axis=0).norm()

tensor(0.7071)

In [74]:
# Gradient of the loss w.r.t. the model output (requires a prior
# `loss.backward()`). The recorded output has a single entry, so it was
# presumably produced with a one-sample batch in an earlier run.
model.pred.grad

tensor([-0.5000])

In [None]:

# Gradient of the loss with respect to the dummy matrix.
dummy_variable.grad
# Per-sample outer products a_i a_i^T, shape (N, hidden_units, hidden_units).
# The result is not stored; only the last expression of a cell is displayed.
activations.reshape(-1, hidden_units, 1).bmm(activations.reshape(-1, 1, hidden_units))
# Squared norm of the scaled gradient matrix.
# NOTE(review): `max_eigenvalue` is not defined in any visible cell.
numpy.linalg.norm((dummy_variable.grad / max_eigenvalue).detach().cpu().numpy()) ** 2

In [40]:
# Softmax over the last axis of `pred`. The axis is passed explicitly because
# calling softmax without `dim` is deprecated and emits a warning; for the 1-D
# `pred` shown in the recorded output this is the same axis the implicit call
# selected.
torch.nn.functional.softmax(pred, dim=-1)

  torch.nn.functional.softmax(pred)


tensor([0.3333, 0.3333, 0.3333], grad_fn=<SoftmaxBackward0>)

In [38]:
import math

def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of a real number.

    The two branches are algebraically identical; each one only ever
    exponentiates a non-positive value, so large-magnitude inputs saturate to
    0.0 or 1.0 instead of raising ``OverflowError`` in ``math.exp``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

# Derivative of the sigmoid at x = 1, using s'(x) = s(x) * (1 - s(x)).
# NOTE(review): this expression evaluates to about 0.1966; the recorded output
# below (0.0983...) is exactly half of that, so it was presumably produced by
# a different version of this cell.
sigmoid(1) * (1-sigmoid(1))

0.09830596662074093