# Exercise 1: Initializing all weights and biases to zero

I did not get around to seeing what happens when you initialize all weights and biases to zero. Try this and train the neural net. You might think either that 1) the network trains just fine or 2) the network doesn't train at all, but actually it is 3) the network trains but only partially, and achieves a pretty bad final performance. Inspect the gradients and activations to figure out what is happening and why the network is only partially training, and what part is being trained exactly.

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [2]:
# read in all the words
# The context manager closes the file as soon as the contents are read,
# instead of leaving the handle open until it is garbage collected.
with open('names.txt', 'r') as names_file:
  words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# total number of words read from names.txt
len(words)

32033

In [4]:
# Character vocabulary and the two lookup tables between characters and ids.
# Ids 1..len(chars) follow the sorted character order; id 0 is the '.' marker.

chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
itos = {index: char for char, index in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [20]:
# build the dataset
# NOTE: build_dataset reads block_size as a global, so the datasets must be
# rebuilt whenever this value changes.
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
  """Turn a list of words into (context, next-character) example tensors.

  Each word is terminated with '.', and every character of the terminated
  word yields one example: the ids of the `block_size` preceding characters
  (0 fills the positions before the word starts) paired with the id of the
  character itself. Ids come from the global `stoi` table.

  Prints the shapes of both tensors and returns them as (X, Y).
  """
  contexts, targets = [], []

  for word in words:
    window = [0] * block_size  # padding ids before the first character
    for target in (stoi[ch] for ch in word + '.'):
      contexts.append(window)
      targets.append(target)
      window = [*window[1:], target]  # slide the window forward by one character

  X = torch.tensor(contexts)
  Y = torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y

import random

# Shuffle the words in place with a fixed seed so the split is reproducible.
random.seed(42)
random.shuffle(words)

# Cut points after 80% and 90% of the shuffled list.
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

Xtr,  Ytr  = build_dataset(words[:n1])     # training split: first 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # dev/validation split: next 10%
Xte,  Yte  = build_dataset(words[n2:])     # test split: final 10%

torch.Size([182580, 3]) torch.Size([182580])
torch.Size([22767, 3]) torch.Size([22767])
torch.Size([22799, 3]) torch.Size([22799])


In [72]:
# MLP revisited, with every weight and bias of both layers initialised to zero.
# Only the embedding table C keeps its random initialisation.
n_embd = 10    # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C  = torch.randn((vocab_size, n_embd), generator=g)
W1 = torch.zeros(n_embd * block_size, n_hidden)
b1 = torch.zeros(n_hidden)
W2 = torch.zeros(n_hidden, vocab_size)
b2 = torch.zeros(vocab_size)

parameters = [C, W1, b1, W2, b2]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

# show that the two layers really start out as all zeros
print(W1, b1, W2, b2)

11897
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True) tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [73]:
# Training loop for the zero-initialised MLP. It is set up for the full
# schedule but deliberately stops after the first step (see the `break` at
# the bottom) so the gradients of that single step can be inspected.
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):

    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

    # forward pass
    emb = C[Xb] # embed the characters into vectors
    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
    hpreact = embcat @ W1 + b1 # hidden layer pre-activation
    # print(hpreact)
    h = torch.tanh(hpreact) # (batch_size, n_hidden)
    # print(h)
    logits = h @ W2 + b2 # (batch_size, vocab_size)
    # print(logits)
    loss = F.cross_entropy(logits, Yb)
    # print(loss)
    #print(loss.item())

    # backward pass
    print('pre backward pass b2.grad', b2.grad)
    for p in parameters:
        p.grad = None # reset so gradients do not accumulate across steps
    loss.backward()
    
    # update
    #lr = lrs[i]
    lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
    # both prints happen before the update below, so b2.data is the pre-update value
    print('post backward pass b2.grad', b2.grad)
    print('b2.data', b2.data)
    for p in parameters:
        p.data += -lr * p.grad


    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

    # stop after a single step: this cell only inspects the first forward/backward pass
    break


# Dump the activations of the last executed step, then every layer parameter
# followed by its gradient. Labels are printed exactly as given.
for label, value in (
    ('hpreact:', hpreact),
    ('h:', h),
    ('logits:', logits),
    ('W1:', W1),
    ('W1.grad', W1.grad),
    ('b1:', b1),
    ('b1.grad', b1.grad),
    ('W2:', W2),
    ('W2.grad', W2.grad),
    ('b2:', b2),
    ('b2.grad', b2.grad),
):
    print(label, value)

pre backward pass b2.grad None
post backward pass b2.grad tensor([-0.2130,  0.0058,  0.0370, -0.0255,  0.0370,  0.0058,  0.0370, -0.0255,
         0.0370, -0.1505,  0.0058,  0.0370, -0.0255,  0.0058, -0.0567,  0.0370,
         0.0370,  0.0370,  0.0370,  0.0370, -0.0255,  0.0370, -0.0255,  0.0370,
         0.0370,  0.0058,  0.0370])
b2.data tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.])
      0/ 200000: 3.2958
hpreact: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<AddBackward0>)
h: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
      

### ANSWER

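Only the b2 values (the biases of the last layer) are ever updated. During the backward pass, the gradient of the loss with respect to the logits passes unchanged through the addition in `logits = h @ W2 + b2` (for an addition, self.grad += 1.0 * out.grad), so b2 receives a non-zero gradient. For the multiplication `h @ W2`, however, each operand's gradient is the other operand times the output gradient (self.grad += other.data * out.grad # x1.grad = w1 * x1w1.grad). W2.grad is therefore proportional to h, which is all zeros because h = tanh(0) = 0, and h.grad is proportional to W2, which is also all zeros. So W2 gets a zero gradient, and no gradient flows further back to b1, W1 or C. The backpropagation effectively stops there, and since W2 and h stay at zero, only the b2 values are updated on each cycle.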

# Exercise 2: Removing the BatchNorm layers during testing

BatchNorm, unlike other normalization layers like LayerNorm/GroupNorm etc. has the big advantage that after training, the batchnorm gamma/beta can be "folded into" the weights of the preceding Linear layers, effectively erasing the need to forward it at test time. Set up a small 3-layer MLP with batchnorms, train the network, then "fold" the batchnorm gamma/beta into the preceding Linear layer's W,b by creating a new W2, b2 and erasing the batch norm. Verify that this gives the same forward pass during inference. i.e. we see that the batchnorm is there just for stabilizing the training, and can be thrown out after training is done! pretty cool.

In [124]:
class Linear:
  """Affine layer computing ``x @ weight`` plus an optional bias.

  The weight is sampled from a standard normal with the module-level
  generator ``g`` and divided by sqrt(fan_in). The bias, when enabled,
  starts at zero. The most recent output is kept in ``self.out``.
  """

  def __init__(self, fan_in, fan_out, bias=True):
    self.weight = torch.randn((fan_in, fan_out), generator=g) / fan_in**0.5
    if bias:
      self.bias = torch.zeros(fan_out)
    else:
      self.bias = None

  def setParams(self, W, b):
    """Replace the weight and bias; pass ``b=None`` for a bias-free layer."""
    self.weight, self.bias = W, b

  def __call__(self, x):
    """Apply the layer to ``x``, store the result in ``self.out`` and return it."""
    self.out = out = x @ self.weight
    if self.bias is not None:
      out += self.bias  # in-place add on the tensor already stored in self.out
    return out

  def parameters(self):
    """Return the tensors of this layer: the weight, then the bias if present."""
    if self.bias is None:
      return [self.weight]
    return [self.weight, self.bias]

class BatchNorm1d:
  """Batch normalisation over dimension 0 with a learnable scale and shift.

  While ``training`` is True the statistics of the current batch are used and
  the running estimates are updated. Otherwise the running estimates are used.
  The most recent output is kept in ``self.out``.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # learnable scale and shift (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # running statistics (maintained with a momentum update, not backprop)
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    """Normalise ``x``, store the result in ``self.out`` and return it."""
    if not self.training:
      mean, var = self.running_mean, self.running_var
    else:
      mean = x.mean(0, keepdim=True)  # per-feature batch mean
      var = x.var(0, keepdim=True)    # per-feature batch variance
    normalised = (x - mean) / torch.sqrt(var + self.eps)
    self.out = self.gamma * normalised + self.beta

    if self.training:
      # move the running estimates towards the batch statistics
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * var
    return self.out

  def parameters(self):
    """Return the tensors trained by backprop: gamma, then beta."""
    return [self.gamma, self.beta]

class Tanh:
  """Parameter-free tanh activation that keeps its latest output in ``self.out``."""

  def __call__(self, x):
    activated = torch.tanh(x)
    self.out = activated
    return activated

  def parameters(self):
    # nothing to train in this layer
    return []

# --------------------------------------NEW--------------------------------------

class FoldBatchNorm:
    """Fold an inference-mode batch norm into the linear layer that feeds it.

    With ``std = sqrt(running_var + eps)``, the batch norm applied to
    ``x @ W + b`` using its running statistics equals ``x @ W' + b'`` where

        W' = W * (gamma / std)
        b' = beta - gamma * (running_mean - b) / std

    A linear layer without a bias is treated as ``b = 0``.
    """

    def __init__(self):
        # result of the most recent fold; None until __call__ has run once
        self.W_prime = None
        self.b_prime = None

    def __call__(self, linear, batchnorm):
        """Return ``(W_prime, b_prime)`` for ``linear`` followed by ``batchnorm``.

        ``linear`` must expose ``weight`` (and optionally ``bias``).
        ``batchnorm`` must expose ``gamma``, ``beta``, ``running_mean``,
        ``running_var`` and ``eps``. Neither layer is modified.
        """
        # The folded tensors are inference-time constants, so they are built
        # without recording an autograd graph back to the trained parameters.
        with torch.no_grad():
            std = torch.sqrt(batchnorm.running_var + batchnorm.eps)
            W_prime = linear.weight * (batchnorm.gamma / std)
            # An existing linear bias is shifted and scaled by the batch norm
            # too, so it has to be part of the folded bias.
            bias = getattr(linear, 'bias', None)
            offset = batchnorm.running_mean if bias is None else batchnorm.running_mean - bias
            b_prime = batchnorm.beta - (batchnorm.gamma * offset) / std
        self.W_prime, self.b_prime = W_prime, b_prime
        return W_prime, b_prime

    def parameters(self):
        """Return ``[W_prime, b_prime]`` from the latest fold, or ``[]`` before any fold."""
        if self.W_prime is None:
            return []
        return [self.W_prime, self.b_prime]
        
# --------------------------------------NEW--------------------------------------

n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 100 # the number of neurons in the hidden layer of the MLP
g = torch.Generator().manual_seed(2147483647) # for reproducibility

# Embedding table followed by three bias-free Linear layers, each with its own
# BatchNorm1d. The layers get names so they can be referenced individually later.
C = torch.randn((vocab_size, n_embd), generator=g)
L1 = Linear(n_embd * block_size, n_hidden, bias=False)
B1 = BatchNorm1d(n_hidden)
L2 = Linear(n_hidden, n_hidden, bias=False)
B2 = BatchNorm1d(n_hidden)
L3 = Linear(n_hidden, vocab_size, bias=False)
B3 = BatchNorm1d(vocab_size)

layers = [L1, B1, Tanh(), L2, B2, Tanh(), L3, B3]

with torch.no_grad():
  # last layer (a batch norm): shrink its gain to make the initial predictions less confident
  layers[-1].gamma *= 0.1
  # every Linear before it: apply the weight gain (1.0 here; 5/3 is the noted alternative)
  for layer in layers[:-1]:
    if not isinstance(layer, Linear):
      continue
    layer.weight *= 1.0

parameters = [C, *(p for layer in layers for p in layer.parameters())]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

16424


In [118]:
# same optimization as last time
# Additionally records, per step and per parameter, the log10 ratio between
# the std of the update (lr * grad) and the std of the parameter data in `ud`.
max_steps = 200000
batch_size = 32
lossi = []
ud = []

for i in range(max_steps):
  
  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
  
  # forward pass
  emb = C[Xb] # embed the characters into vectors
  x = emb.view(emb.shape[0], -1) # concatenate the vectors
  for layer in layers:
    x = layer(x)
  loss = F.cross_entropy(x, Yb) # loss function
  
  # backward pass
  for layer in layers:
    layer.out.retain_grad() # AFTER_DEBUG: would take out retain_grad (only needed to inspect layer.out.grad)
  for p in parameters:
    p.grad = None # reset so gradients do not accumulate across steps
  loss.backward()
  
  # update
  lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
  for p in parameters:
    p.data += -lr * p.grad

  # track stats
  if i % 10000 == 0: # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())
  with torch.no_grad():
    # update-to-data ratio; note p.data has already been updated at this point
    ud.append([((lr*p.grad).std() / p.data.std()).log10().item() for p in parameters])

  # if i >= 1000:
  #   break # AFTER_DEBUG: would take out obviously to run full optimization

      0/ 200000: 3.3206
  10000/ 200000: 2.0220
  20000/ 200000: 1.8240
  30000/ 200000: 2.3696
  40000/ 200000: 2.1317
  50000/ 200000: 1.9742
  60000/ 200000: 2.0142
  70000/ 200000: 2.0138
  80000/ 200000: 2.0901
  90000/ 200000: 1.9777
 100000/ 200000: 2.2713
 110000/ 200000: 2.4479
 120000/ 200000: 2.5790
 130000/ 200000: 1.8881
 140000/ 200000: 1.8387
 150000/ 200000: 1.9365
 160000/ 200000: 1.8594
 170000/ 200000: 2.0975
 180000/ 200000: 1.9352
 190000/ 200000: 2.1045


In [116]:

# --------------------------------------NEW--------------------------------------

fold_bn = FoldBatchNorm()

# Compute the folded weight and bias for every (Linear, BatchNorm1d) pair first ...
(W1_prime, b1_prime), (W2_prime, b2_prime), (W3_prime, b3_prime) = (
    fold_bn(linear, batchnorm)
    for linear, batchnorm in ((L1, B1), (L2, B2), (L3, B3))
)

# ... then install them on the Linear layers in place.
L1.setParams(W1_prime, b1_prime)
L2.setParams(W2_prime, b2_prime)
L3.setParams(W3_prime, b3_prime)

# The network no longer contains any batch norm layers.
layers = [L1, Tanh(), L2, Tanh(), L3]

# --------------------------------------NEW--------------------------------------

@torch.no_grad() # evaluation only: no gradient tracking needed
def split_loss(split):
  """Print the cross-entropy loss of the current `layers` stack on one split.

  `split` is one of 'train', 'val' or 'test'; any other key raises KeyError.
  """
  datasets = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }
  inputs, targets = datasets[split]
  embedded = C[inputs] # (N, block_size, n_embd)
  activations = embedded.view(embedded.shape[0], -1) # concat into (N, block_size * n_embd)
  for layer in layers:
    activations = layer(activations)
  print(split, F.cross_entropy(activations, targets).item())

# put layers into eval mode
# (BatchNorm1d reads `training` to choose between batch and running statistics)
for layer in layers:
  layer.training = False
# report the loss on the training and validation splits
split_loss('train')
split_loss('val')

train 2.0478317737579346
val 2.1033167839050293


In [123]:
@torch.no_grad() # evaluation only: no gradient tracking needed
def split_loss(split):
  """Print the cross-entropy loss of the current `layers` stack on one split.

  `split` is one of 'train', 'val' or 'test'; any other key raises KeyError.
  """
  datasets = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }
  inputs, targets = datasets[split]
  embedded = C[inputs] # (N, block_size, n_embd)
  activations = embedded.view(embedded.shape[0], -1) # concat into (N, block_size * n_embd)
  for layer in layers:
    activations = layer(activations)
  print(split, F.cross_entropy(activations, targets).item())

# put layers into eval mode
# (BatchNorm1d reads `training` to choose between batch and running statistics)
for layer in layers:
  layer.training = False
# report the loss on the training and validation splits
split_loss('train')
split_loss('val')

train 2.0478317737579346
val 2.1033167839050293


**The train and val losses are the same in both evaluation runs above.**

### ANSWER:

I was able to fold the batchnorm layer into the previous weight layer by taking the batchnorm equation and substituting the linear layer equation (Wx + b) for the x value.

Normal BatchNorm Layer: y = (x - mean)/sqrt(variance + epsilon) * gamma + beta
Folded BatchNorm Layer: y = gamma * ((Wx+b) - mean)/sqrt(variance + epsilon) + beta

When we reformat this equation to be in the form of Wx + b, we get:

W' = W * (gamma/sqrt(variance + epsilon))

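b' = beta + (gamma * (b - mean))/sqrt(variance + epsilon)

Because the Linear layers here are created with bias=False (b = 0), this reduces to:

b' = beta - ((gamma * mean)/sqrt(variance + epsilon))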

If we apply this transformation to each linear layer and get rid of the batchnorm layers completely, we get the exact same train and val losses of 2.0478317737579346 and 2.1033167839050293, respectively.