In [5]:
import torch
import sklearn.datasets as skds
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
import torchvision
from torch import nn
import numpy

# Define model and custom layers

In [6]:
class MyCustomLayer(nn.Module):
	def __init__(self, size_in, size_out):
		super().__init__()

		self.size_in, self.size_out = size_in, size_out
		self.weights = nn.Parameter(torch.Tensor(size_out, size_in))  # nn.Parameter is a Tensor that's a module parameter.
		self.bias = nn.Parameter(torch.Tensor(size_out))

		self.linear1 = nn.Linear(size_in, size_out)
		self.activation = nn.ReLU(inplace=True) # inplace = don't use extra memory

		# initialize weights and biases
		self.apply(self.weights_init_normal)
		
	
	def forward(self, x):
		# 1. X is batch of the samples, as input
		# Note to self: X is 2 dimensional: First dim is the batch size, second dim is the number of features!
		#x = torch.flatten(x) # We don't need to flatten, since dim=0 is the batch size, and we only care about features

		# 2. Split input into two tensors with same length (we can assume that the length of X is even)
		assert x.shape[0] % 2 == 0
		x1, x2 = torch.tensor_split(x, 2, dim=1) # Split the features dimension
		assert x1.size() == x2.size()

		# 3. Put two tensors into the same aggregation layer (linear, for example)
		x1 = self.linear1(x1)
		x2 = self.linear1(x2)

		# 3. Then put into activation layer
		x1 = self.activation(x1)
		x2 = self.activation(x2)

		# 4. Concatinate two halves to create output Y
		#Y = torch.cat((x1, x2))
		#print(Y.size())
		Y = (x1+x2)/2
		return Y
	
	def weights_init_normal(self, m):
		'''Takes in a module and initializes all linear layers with weight values taken from a normal distribution.'''

		classname = m.__class__.__name__
		# for every Linear layer in a model
		if classname.find('Linear') != -1:
			y = m.in_features
			# m.weight.data shoud be taken from a normal distribution
			m.weight.data.normal_(0.0,1/numpy.sqrt(y))
			# m.bias.data should be 0
			m.bias.data.fill_(0)
		
			print("bias = ", m.bias.data)
			print("weight = ", m.weight.data)
			print("weight mean: ", m.weight.data.mean(), "min: ", torch.min(m.weight.data), "max: ", torch.max(m.weight.data))

model = nn.Sequential(
	MyCustomLayer(392, 20),
	nn.Softmax()
)

print(model)


bias =  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
weight =  tensor([[-9.5368e-02,  9.6251e-02,  5.0826e-03,  ...,  5.1594e-02,
         -1.0958e-01,  2.5136e-02],
        [-4.0485e-02,  5.3265e-02,  5.3008e-02,  ...,  1.1843e-02,
          3.2150e-02,  4.6764e-02],
        [ 1.4811e-03,  2.1794e-02, -2.0335e-02,  ..., -2.0085e-02,
          6.2974e-02, -2.9049e-02],
        ...,
        [ 4.0089e-02, -3.0543e-02,  9.3759e-02,  ...,  8.1432e-02,
         -4.5846e-02, -6.9860e-02],
        [ 5.9842e-02,  5.7093e-03,  5.6555e-03,  ..., -3.9907e-03,
          2.1229e-02,  6.9627e-02],
        [ 5.6902e-05, -3.7168e-02,  1.8137e-02,  ..., -1.7993e-03,
         -9.8354e-02, -5.4815e-03]])
weight mean:  tensor(-0.0004) min:  tensor(-0.2025) max:  tensor(0.1838)
Sequential(
  (0): MyCustomLayer(
    (linear1): Linear(in_features=392, out_features=20, bias=True)
    (activation): ReLU(inplace=True)
  )
  (1): Softmax(dim=None)
)


# Initialize parameters - link

https://stackoverflow.com/a/55546528/5854499


# Load train data

In [7]:
# train_data_transformed = torchvision.datasets.FashionMNIST( 
#     root="/22961", train=True, download=False, 
#     transform=torchvision.transforms.PILToTensor())
# train_dataloader = DataLoader(train_data_transformed, batch_size=4)

# trans=torchvision.transforms.Compose(
#     [torchvision.transforms.PILToTensor(),
#      torchvision.transforms.ConvertImageDtype(torch.float)])
# train_data_transformed = torchvision.datasets.FashionMNIST(
#     root="/22961", train=True, download=True,
#     transform=trans)

dataset = torchvision.datasets.FashionMNIST(root="/22961", train=True, download=True, transform=torchvision.transforms.PILToTensor())
dataloader = DataLoader(dataset, batch_size=10)

# Define cost and optimizer functions

We use Negative Log Likelihood Loss (NLLLoss), which is our Cross Entropy function

And we use classic SGD optimizer to find minimum

In [8]:
CE_loss=torch.nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Define batch iterative function

This function is called for each batch

In [9]:
def iterate_batch(idx):
	imgs, labels = next(iter(dataloader))
	imgs = imgs.flatten(start_dim=1)
	optimizer.zero_grad()
	y_model=model(imgs.float())

	loss=CE_loss(y_model,labels)
	loss.backward()
	optimizer.step()

	predicted_labels = y_model.argmax(dim=1)
	acc = (predicted_labels == labels).sum()/len(labels)
	return loss.detach(), acc.detach()

# Train

In [10]:
from tqdm import tqdm
batches=len(dataloader)
print("num of batches: \n", batches)
batch_loss=torch.zeros(batches)
batch_acc=torch.zeros(batches)
for idx in tqdm(range(batches)):
	batch_loss[idx], batch_acc[idx] = iterate_batch(idx)

num of batches: 
 6000


  input = module(input)
100%|██████████| 6000/6000 [00:07<00:00, 795.31it/s]


# Block diagram of the NN

![](./drawio/Untitled%20Diagram.drawio.png)

# Number of parameters

In [11]:
def get_n_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(get_n_params(model))
print(count_parameters(model))

15720
15720


### Manual calculation

```
Hidden layer:

392*20 + 392*20 = 7840 weights
Also, we have 20 output neurons: 20 neurons means 20 biases (one bias for each Y_0, Y_1, ..., Y_19 equation)

Activation layer:
0 (no weights, no bias, ReLU is max(0, X) so no additional parameters except from the input to the function itself)


Summary:
7840 + 0 + 20 = 7860
```

### Compared to built-in layers of PyTorch

It's the same, but I calculated manually diffirently

In [12]:
torch_model = torch.nn.Sequential(
	torch.nn.Linear(392, 20),
	torch.nn.Softmax()
)
print(get_n_params(model))
print(count_parameters(model))

15720
15720


In [13]:
from prettytable import PrettyTable
def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params+=params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params

count_parameters(model)
print(model)

count_parameters(torch_model)
print(torch_model)

+------------------+------------+
|     Modules      | Parameters |
+------------------+------------+
|    0.weights     |    7840    |
|      0.bias      |     20     |
| 0.linear1.weight |    7840    |
|  0.linear1.bias  |     20     |
+------------------+------------+
Total Trainable Params: 15720
Sequential(
  (0): MyCustomLayer(
    (linear1): Linear(in_features=392, out_features=20, bias=True)
    (activation): ReLU(inplace=True)
  )
  (1): Softmax(dim=None)
)
+----------+------------+
| Modules  | Parameters |
+----------+------------+
| 0.weight |    7840    |
|  0.bias  |     20     |
+----------+------------+
Total Trainable Params: 7860
Sequential(
  (0): Linear(in_features=392, out_features=20, bias=True)
  (1): Softmax(dim=None)
)
