In [1]:
import torch
import torch.nn as nn

# Linear layer with 40 input features and 10 output features.
layer = nn.Linear(40, 10)

# Rescale the default weights in place by sqrt(6), then clear the bias.
layer.weight.data.mul_(6 ** 0.5)
layer.bias.data.zero_()

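# nn.Linear's default init draws weights from U(-1/sqrt(fan_in), 1/sqrt(fan_in)), where fan_in is the number of input features.
# Scaling them by sqrt(6) gives the Kaiming (He) uniform range U(-sqrt(6/fan_in), sqrt(6/fan_in)); the bias is reset to zero.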

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [2]:
# He-uniform weights for a ReLU layer ("relu" yields the same gain as the
# default leaky_relu with slope 0), followed by a zeroed bias.
nn.init.kaiming_uniform_(layer.weight, mode="fan_in", nonlinearity="relu")
nn.init.zeros_(layer.bias)

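# Same Kaiming (He) initialization as before, now done with the built-in nn.init helpers.
# He init is meant for ReLU-type activations: ReLU zeroes every pre-activation that is <= 0 (roughly half of them), so the weights are scaled up to keep the signal variance from shrinking layer after layer.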

Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)

In [3]:
def use_he_init(module):
  """Apply He (Kaiming) uniform initialization to an ``nn.Linear`` module.

  Meant to be passed to ``Module.apply``; any module that is not an
  ``nn.Linear`` is left untouched.

  Args:
    module: The module to (possibly) re-initialize in place.
  """
  if isinstance(module, nn.Linear):
    nn.init.kaiming_uniform_(module.weight)
    # Layers created with bias=False have ``bias`` set to None; skip them
    # instead of crashing inside nn.init.zeros_.
    if module.bias is not None:
      nn.init.zeros_(module.bias)

# Small ReLU network; apply() calls use_he_init on each submodule, so both
# Linear layers get re-initialized.
model = nn.Sequential(
    nn.Linear(50, 40),
    nn.ReLU(),
    nn.Linear(40, 1),
    nn.ReLU(),
)
model.apply(use_he_init)


Sequential(
  (0): Linear(in_features=50, out_features=40, bias=True)
  (1): ReLU()
  (2): Linear(in_features=40, out_features=1, bias=True)
  (3): ReLU()
)

In [4]:
# Negative-side slope shared by the activation and the initializer.
alpha = 0.2

model = nn.Sequential(nn.Linear(50, 40), nn.LeakyReLU(negative_slope=alpha))

# He-uniform init with the gain computed for this leaky slope.
nn.init.kaiming_uniform_(model[0].weight, a=alpha, nonlinearity="leaky_relu")

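# Leaky ReLU keeps a small slope (alpha) for negative inputs instead of outputting 0, so units whose linear output is mostly negative still receive a gradient and do not "die".
# The same alpha is passed to kaiming_uniform_ so that the initialization gain matches the activation.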

Parameter containing:
tensor([[-0.0385,  0.2261,  0.0679,  ...,  0.1450,  0.2352,  0.2870],
        [-0.0147, -0.2404, -0.3003,  ...,  0.0973, -0.1113, -0.2559],
        [ 0.3144,  0.2840,  0.1420,  ...,  0.2951,  0.1464,  0.0474],
        ...,
        [-0.1000,  0.1273, -0.0369,  ...,  0.1679, -0.1796,  0.1265],
        [ 0.0923,  0.2405,  0.0134,  ..., -0.0158, -0.1935,  0.0381],
        [-0.1459,  0.3137,  0.3157,  ...,  0.1535,  0.0735,  0.3239]],
       requires_grad=True)

In [5]:
# MLP over inputs that flatten to 1*28*28 features; BatchNorm1d is applied to
# the flattened input and after each hidden ReLU.
model = nn.Sequential(
    nn.Flatten(),
    nn.BatchNorm1d(num_features=1 * 28 * 28),
    nn.Linear(in_features=1 * 28 * 28, out_features=300),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=300),
    nn.Linear(in_features=300, out_features=100),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=100),
    nn.Linear(in_features=100, out_features=10),
)

# The first BatchNorm1d layer standardizes the raw inputs, so no separate input normalization step is needed.

In [6]:
# Names of the learnable parameters of the input BatchNorm1d layer (model[1]).
bn_param_names = dict(model[1].named_parameters()).keys()
bn_param_names

dict_keys(['weight', 'bias'])

In [7]:
# Names of the buffers (non-trainable state) of the input BatchNorm1d layer (model[1]).
bn_buffer_names = dict(model[1].named_buffers()).keys()
bn_buffer_names

# Buffers are non-trainable state: the running mean and running variance estimated during training so far, plus the number of batches seen.
# They are saved with the model but are not updated by the optimizer.

dict_keys(['running_mean', 'running_var', 'num_batches_tracked'])

In [8]:
# MLP over inputs that flatten to 1*28*28 features, with BatchNorm1d placed
# before each hidden ReLU; Linear layers feeding a BatchNorm are built
# without a bias term.
model = nn.Sequential(
    nn.Flatten(),
    nn.BatchNorm1d(num_features=1 * 28 * 28),
    nn.Linear(in_features=1 * 28 * 28, out_features=300, bias=False),
    nn.BatchNorm1d(num_features=300),
    nn.ReLU(),
    nn.Linear(in_features=300, out_features=100, bias=False),
    nn.BatchNorm1d(num_features=100),
    nn.ReLU(),
    nn.Linear(in_features=100, out_features=10),
)

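# Here BatchNorm1d is placed before the ReLU activation, so each layer's pre-activations are standardized per feature (zero mean, unit variance over the batch) and then scaled and shifted by BatchNorm's learnable weight and bias before the nonlinearity is applied.
# The Linear layers that feed a BatchNorm use bias=False: BatchNorm subtracts the per-feature batch mean, which cancels any constant bias, and it adds its own learnable shift, so a Linear bias would only add redundant parameters and computation.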

In [9]:
# Two samples with three features each; the second row is 100x the first.
data = torch.tensor(
    [[1.0, 2.0, 3.0], [100.0, 200.0, 300.0]],
    dtype=torch.float32,
)

print(data)

tensor([[  1.,   2.,   3.],
        [100., 200., 300.]])


In [10]:
# Fresh BatchNorm1d over 3 features, explicitly put in training mode before
# it is applied to the batch.
bn = nn.BatchNorm1d(3)
bn.train()
out_bn = bn(data)
print(out_bn)

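# BatchNorm normalizes each column (feature) across the batch: it subtracts the column mean and divides by the column standard deviation.
# With only two rows, the smaller value in every column maps to about -1 and the larger to about +1, regardless of the column's scale.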

tensor([[-1.0000, -1.0000, -1.0000],
        [ 1.0000,  1.0000,  1.0000]], grad_fn=<NativeBatchNormBackward0>)


In [11]:
# LayerNorm over the last dimension (the 3 features of each row).
ln = nn.LayerNorm(3)
out_ln = ln(data)
print(out_ln)
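# LayerNorm normalizes each row (instance) across its own features: it subtracts the row mean and divides by the row standard deviation, so the middle value (2 or 200, which equals the row mean) becomes 0.
# Each instance is normalized separately, independent of the rest of the batch, which is why both rows give the same output.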

tensor([[-1.2247,  0.0000,  1.2247],
        [-1.2247,  0.0000,  1.2247]], grad_fn=<NativeLayerNormBackward0>)


In [12]:
# Random batch shaped (32, 3, 100, 200).
inputs = torch.randn((32, 3, 100, 200))
# Normalize over the last two dimensions only.
layer_norm = nn.LayerNorm(normalized_shape=[100, 200])
result = layer_norm(inputs)
print(result.shape)

# For a batch of 32 RGB images (3 channels, height 100, width 200), normalized_shape tells LayerNorm which trailing dimensions to normalize over.
# With [100, 200], each channel of each image is normalized separately over its height and width.

torch.Size([32, 3, 100, 200])


In [13]:
# Normalize over channels, height and width together.
layer_norm = nn.LayerNorm(normalized_shape=[3, 100, 200])
result = layer_norm(inputs)

# With [3, 100, 200], the statistics are computed over all channels, rows and columns together, so each image is normalized as a whole rather than each color channel (red, green, blue) separately.

In [14]:
import torch.optim as optim

# Two-layer LSTM mapping 10 input features to 20 hidden units per time step.
model = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Random input batch and a random target with the same shape as the LSTM output.
input_seq = torch.randn((50, 32, 10))
target = torch.randn((50, 32, 20))

model.train()

# One training step: forward, backward, clip, update.
optimizer.zero_grad()
output, _ = model(input_seq)
loss = criterion(output, target)
loss.backward()

# The value returned by clip_grad_norm_ is the total gradient norm measured
# before any rescaling is applied.
grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
print(f"Total norm Before clipping:{grad_norm:.4f}")

optimizer.step()


# Gradient clipping by norm rescales all gradients together whenever their total norm exceeds max_norm, so the update keeps its direction but its size is capped. clip_grad_norm_ returns the total norm measured before clipping.
# This is particularly useful in RNNs and LSTMs, where the same weights are applied at every time step, so backpropagation through time can cause exploding gradients.


Total norm Before clipping:0.0313
