# makemore: part 3

- E01: I did not get around to seeing what happens when you initialize all weights and biases to zero. Try this and train the neural net. You might think either that 1) the network trains just fine or 2) the network doesn't train at all, but actually it is 3) the network trains but only partially, and achieves a pretty bad final performance. Inspect the gradients and activations to figure out what is happening and why the network is only partially training, and what part is being trained exactly.

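Since the weights and biases are all 0, the input to every tanh is 0, and tanh(0) = 0, so all hidden activations are 0. During backpropagation the gradient of the last layer's weights is the (zero) hidden activation multiplied by the output gradient, so it is 0; and the gradient flowing back into the hidden layer is the output gradient multiplied by the (zero) last-layer weights, so it is 0 as well, which in turn makes the gradients of the first layer and of the embedding 0. This means that the weights will never be updated, and the network cannot learn anything from its input. Only the bias of the final layer receives a non-zero gradient and can actually learn.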

In [170]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [171]:
# read in all the words (one per line); the context manager makes sure the
# file handle is closed instead of being left to the garbage collector
with open('../names.txt', 'r') as names_file:
  words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [172]:
# how many entries `words` holds (the cell displays this value)
len(words)

32033

In [180]:
# derive the character vocabulary and the lookup tables char -> index and index -> char
chars = sorted(set(''.join(words)))
# characters are numbered from 1 in sorted order; '.' is given index 0
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [203]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
  """Turn a list of words into (context, next-character) example tensors.

  Every word is terminated with '.', and each of its characters (terminator
  included) yields one example: the input is the `block_size` character
  indices seen just before it (index 0 is used where the word has not started
  yet) and the label is the index of the character itself.

  Reads the module-level `block_size` and `stoi`. Prints the shapes of the two
  tensors and returns them as the tuple (X, Y).
  """
  inputs, targets = [], []

  for word in words:
    window = [0] * block_size  # context before the first character
    for ch in word + '.':
      target = stoi[ch]
      inputs.append(window)
      targets.append(target)
      window = window[1:] + [target]  # drop the oldest index, append the newest

  X = torch.tensor(inputs)
  Y = torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y

import random

# fixed seed, so the in-place shuffle (and therefore the split) is reproducible
random.seed(42)
random.shuffle(words)

# cut points at 80% and 90% of the shuffled word list
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

Xtr,  Ytr  = build_dataset(words[:n1])     # training split: first 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # dev split: next 10%
Xte,  Yte  = build_dataset(words[n2:])     # test split: last 10%


torch.Size([182437, 3]) torch.Size([182437])
torch.Size([22781, 3]) torch.Size([22781])
torch.Size([22928, 3]) torch.Size([22928])


In [204]:
# seeded torch generator; Linear.__init__ draws its initial weights from this global `g`
g = torch.Generator().manual_seed(2147483647) # for reproducibility

In [205]:
# The classes we create here are the same API as nn.Module in PyTorch

class Linear:
  """Affine layer: computes `x @ weight`, plus `bias` when one is present.

  The most recent result is also kept on `self.out` so it can be inspected
  after the forward pass.
  """

  def __init__(self, fan_in, fan_out, bias=True):
    # Gaussian weights divided by sqrt(fan_in), drawn from the module-level generator `g`
    scale = fan_in**0.5
    self.weight = torch.randn((fan_in, fan_out), generator=g) / scale
    # bias starts at zero, or is None when the layer is built with bias=False
    self.bias = torch.zeros(fan_out) if bias else None

  def __call__(self, x):
    result = x @ self.weight
    if self.bias is not None:
      result += self.bias  # in-place add on the freshly created matmul result
    self.out = result
    return result

  def parameters(self):
    """Return the layer's tensors: [weight] or [weight, bias]."""
    params = [self.weight]
    if self.bias is not None:
      params.append(self.bias)
    return params


class BatchNorm1d:
  """Batch normalisation over dimension 0 with running statistics.

  While `self.training` is True the statistics of the current batch are used
  and folded into `running_mean` / `running_var`; otherwise the running
  values are used and left untouched. The result is also stored on `self.out`.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.training = True
    self.eps = eps
    self.momentum = momentum
    # scale and shift (returned by parameters(), trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # running statistics (updated with a momentum average, not by backprop)
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    # pick the statistics to normalise with
    if self.training:
      mean = x.mean(0, keepdim=True)
      var = x.var(0, keepdim=True)
    else:
      mean, var = self.running_mean, self.running_var

    normalized = (x - mean) / torch.sqrt(var + self.eps)
    self.out = self.gamma * normalized + self.beta

    # blend the batch statistics into the running buffers
    if self.training:
      keep = 1 - self.momentum
      with torch.no_grad():
        self.running_mean = keep * self.running_mean + self.momentum * mean
        self.running_var = keep * self.running_var + self.momentum * var
    return self.out

  def parameters(self):
    """Return [gamma, beta]."""
    return [self.gamma, self.beta]

class Tanh:
  """Element-wise tanh that keeps its most recent result on `self.out`."""

  def __call__(self, x):
    activated = torch.tanh(x)
    self.out = activated
    return activated

  def parameters(self):
    # this layer holds no tensors of its own
    return []

In [206]:
def print_parameters(epoch, layers):
  """Print a one-element snapshot of every layer's state for debugging.

  For each Linear / BatchNorm1d / Tanh in `layers`, the first entry of its
  tensors is printed (`[0][0]` for weights and outputs, `[0]` for biases,
  gamma and beta). When `epoch` is 0 only the parameter values are shown.
  For any other value the matching `.grad` entries are shown too, plus the
  output and output gradient of Linear and Tanh layers; this relies on
  `.grad` being populated on the parameters and on `layer.out`.

  A Linear layer built with bias=False (its `bias` is None) simply omits the
  bias lines instead of failing on `None[0]`.
  """
  show_grads = epoch != 0
  print(f'epoch {epoch}')
  for i, layer in enumerate(layers):
    # layers of any other type are silently skipped
    if not isinstance(layer, (Linear, BatchNorm1d, Tanh)):
      continue
    print(f'layer {i} ({layer.__class__.__name__})')
    if isinstance(layer, Linear):
      print(f'  weight: {layer.weight[0][0]}')
      if show_grads:
        print(f'  weight_grad: {layer.weight.grad[0][0]}')
      if layer.bias is not None:
        print(f'  bias: {layer.bias[0]}')
        if show_grads:
          print(f'  bias_grad: {layer.bias.grad[0]}')
    elif isinstance(layer, BatchNorm1d):
      print(f'  gamma: {layer.gamma[0]}')
      if show_grads:
        print(f'  gamma_grad: {layer.gamma.grad[0]}')
      print(f'  beta: {layer.beta[0]}')
      if show_grads:
        print(f'  beta_grad: {layer.beta.grad[0]}')
    # outputs are reported for Linear and Tanh layers only
    if show_grads and isinstance(layer, (Linear, Tanh)):
      print(f' output: {layer.out[0][0]}')
      print(f' output_grad: {layer.out.grad[0][0]}')
  print('\n')

In [207]:
# Build the network for this experiment and zero-initialise its Linear layers

n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 100 # the number of neurons in the hidden layer of the MLP
g = torch.Generator().manual_seed(2147483647) # for reproducibility

# embedding table, drawn from `g` before any layer weights are drawn
C = torch.randn((vocab_size, n_embd),            generator=g)
# Deeper BatchNorm variant, left commented out:
# layers = [
#   Linear(n_embd * block_size, n_hidden, bias=True), BatchNorm1d(n_hidden), Tanh(),
#   Linear(           n_hidden, n_hidden, bias=True), BatchNorm1d(n_hidden), Tanh(),
#   Linear(           n_hidden, n_hidden, bias=True), BatchNorm1d(n_hidden), Tanh(),
#   Linear(           n_hidden, n_hidden, bias=True), BatchNorm1d(n_hidden), Tanh(),
#   Linear(           n_hidden, n_hidden, bias=True), BatchNorm1d(n_hidden), Tanh(),
#   Linear(           n_hidden, vocab_size, bias=True), BatchNorm1d(vocab_size),
# ]
# Network actually used: Linear -> Tanh -> Linear
# NOTE(review): the last Linear has n_hidden outputs, whereas the commented-out
# variant above ends in vocab_size outputs; confirm this is intended
layers = [
  Linear(n_embd * block_size, n_hidden), Tanh(),
  Linear(           n_hidden, n_hidden), 
]

with torch.no_grad():
  # set every Linear layer's weights and biases to zero by multiplying in place
  # (the embedding table C is not touched and keeps its random values)
  for layer in layers:
    if isinstance(layer, Linear):
      layer.weight *= 0.0 # alternatives tried: 1.0, 5/3
      layer.bias *= 0.0

# everything that is updated by the optimisation loop: the embedding plus all layer tensors
parameters = [C] + [p for layer in layers for p in layer.parameters()]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

13470


In [208]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = [] # not appended to inside this loop
ud = [] # not appended to inside this loop

for i in range(max_steps):

  # debug dump, made before this step's forward pass: for i >= 1 it shows the
  # already-updated parameters together with the gradients and outputs left
  # over from the previous step
  print_parameters(i, layers)
  
  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
  
  # forward pass
  emb = C[Xb] # embed the characters into vectors
  x = emb.view(emb.shape[0], -1) # concatenate the vectors
  for layer in layers:
    x = layer(x)
  loss = F.cross_entropy(x, Yb) # loss function
  
  # backward pass
  for layer in layers:
    layer.out.retain_grad() # AFTER_DEBUG: would take out retain_grad
  for p in parameters:
    p.grad = None # clear the gradients of the previous step
  loss.backward()
  
  # update: plain gradient step with a fixed learning rate
  lr = 0.1 
  for p in parameters:
    p.data += -lr * p.grad
  
  # stop after the second iteration (i == 1), so two update steps are taken
  if i >= 1:
    break # AFTER_DEBUG: would take out obviously to run full optimization

epoch 0
layer 0 (Linear)
  weight: -0.0
  bias: 0.0
layer 1 (Tanh)
layer 2 (Linear)
  weight: 0.0
  bias: 0.0


epoch 1
layer 0 (Linear)
  weight: -0.0
  weight_grad: 0.0
  bias: 0.0
  bias_grad: 0.0
 output: 0.0
 output_grad: 0.0
layer 1 (Tanh)
 output: 0.0
 output_grad: 0.0
layer 2 (Linear)
  weight: 0.0
  weight_grad: 0.0
  bias: 0.024000002071261406
  bias_grad: -0.24000000953674316
 output: 0.0
 output_grad: 0.0003124999930150807




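One can see that the gradient arriving at the tanh output is exactly 0: it is the last layer's output gradient multiplied by that layer's all-zero weights, so nothing flows back into the tanh or the first linear layer. The weight gradient of the last linear layer is 0 as well, because its input (the tanh output) is 0. Only the bias of the last linear layer receives a non-zero gradient, so only it can learn.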