Basically the same code as in video 6, but trained only on Unsong. The training data is the raw text of Unsong with most special characters removed, everything converted to lower case, and each line treated as one individual training example. I also removed very short and very long lines.

In [5]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

Setting Hyper-Parameters

In [6]:
# Length filter for corpus lines: a line is kept only if
# min_line_len < len(line) < max_line_len (both bounds are exclusive).
min_line_len = 10
max_line_len = 120

# context length: how many characters do we take to predict the next one?
block_size = 16

n_embd = 16  # the dimensionality of the character embedding vectors
n_hidden = 128  # the number of neurons in each hidden layer of the MLP
max_steps = 85000  # number of SGD steps run by the training loop
batch_size = 64  # number of examples sampled for each minibatch

In [7]:
# read in all the lines
# The context manager closes the file handle as soon as the text is read,
# instead of leaving it open until garbage collection.
with open("clean_unsong.txt", "r") as f:
    all_lines = f.read().splitlines()
print(len(all_lines))  # total number of lines in the corpus
print(max(len(line) for line in all_lines))  # length of the longest line
print(all_lines[:8])  # peek at the first few lines

27591
210
['', 'prologue', 'i', 'in retrospect, there had been omens and portents.', 'we are now approaching lunar sunrise, said william anders,', 'and for all the people back on earth, the crew of apollo  has a message', 'that we would like to send to you.', 'rivers flowed uphill. a new star was seen in the night sky. a']


In [8]:
# Keep only the lines whose length lies strictly between the two bounds.
lines = [
    candidate
    for candidate in all_lines
    if min_line_len < len(candidate) < max_line_len
]

In [9]:
# number of lines that survived the length filter
len(lines)

22354

In [10]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(set("".join(lines)))
if '-' in chars:
    # '-' is reserved as the padding / end-of-line token at index 0. If it also
    # occurred in the text, the assignment below would overwrite its real index,
    # leaving a gap in the mapping and making vocab_size one too small.
    raise ValueError("'-' occurs in the training text but is reserved as the padding/end-of-line token")
stoi = {s: i + 1 for i, s in enumerate(chars)}  # real characters start at index 1
stoi['-'] = 0  # special token: pads the initial context and marks the end of a line
itos = {i: s for s, i in stoi.items()}  # inverse mapping: index -> character
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: ' ', 2: ',', 3: '.', 4: '?', 5: 'a', 6: 'b', 7: 'c', 8: 'd', 9: 'e', 10: 'f', 11: 'g', 12: 'h', 13: 'i', 14: 'j', 15: 'k', 16: 'l', 17: 'm', 18: 'n', 19: 'o', 20: 'p', 21: 'q', 22: 'r', 23: 's', 24: 't', 25: 'u', 26: 'v', 27: 'w', 28: 'x', 29: 'y', 30: 'z', 0: '-'}
31


In [11]:
# shuffle the lines in place; the fixed seed makes the resulting order
# (and therefore the train/dev/test split taken from it) repeatable
import random

random.seed(42)
random.shuffle(lines)

In [12]:
# build the dataset
def build_dataset(lines):
    """Turn text lines into (context, next-character) training pairs.

    Every line is extended with the '-' end token. For each character of the
    extended line one example is emitted: the previous ``block_size`` character
    indices (left-padded with 0 at the start of a line) and the index of the
    character itself as the target.

    Prints the shapes of both tensors and returns them as ``(X, Y)``.
    """
    contexts, targets = [], []

    for text in lines:
        window = [0] * block_size  # every line starts from an all-padding context
        for token in (stoi[ch] for ch in text + '-'):
            contexts.append(window)
            targets.append(token)
            window = window[1:] + [token]  # slide the window by one character

    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y


# Split boundaries: first 80% of the lines for training, next 10% for dev,
# final 10% for test.
n1, n2 = (int(fraction * len(lines)) for fraction in (0.8, 0.9))
train_lines, dev_lines, test_lines = lines[:n1], lines[n1:n2], lines[n2:]
Xtr, Ytr = build_dataset(train_lines)  # 80%
Xdev, Ydev = build_dataset(dev_lines)  # 10%
Xte, Yte = build_dataset(test_lines)  # 10%

torch.Size([1017351, 16]) torch.Size([1017351])
torch.Size([127001, 16]) torch.Size([127001])
torch.Size([127411, 16]) torch.Size([127411])


In [13]:
# Print the first 20 training examples as "context --> target", decoded to characters.
for x, y in zip(Xtr[:20], Ytr[:20]):
  context_text = ''.join(itos[token.item()] for token in x)
  print(context_text, '-->', itos[y.item()])

---------------- --> o
---------------o --> f
--------------of -->  
-------------of  --> d
------------of d --> a
-----------of da --> y
----------of day --> .
---------of day. -->  
--------of day.  --> a
-------of day. a --> l
------of day. al --> l
-----of day. all -->  
----of day. all  --> o
---of day. all o --> f
--of day. all of -->  
-of day. all of  --> t
of day. all of t --> h
f day. all of th --> e
 day. all of the --> s
day. all of thes --> e


Definition of all the layer classes.

In [14]:
# Near copy paste of the layers we have developed in Part 3

# -----------------------------------------------------------------------------------------------
class Linear:
  """Affine layer: ``x @ weight``, plus ``bias`` when one is configured.

  The most recent output is kept on ``self.out``.
  """

  def __init__(self, fan_in, fan_out, bias=True):
    # note: kaiming-style init, Gaussian weights scaled by 1/sqrt(fan_in)
    scale = fan_in**0.5
    self.weight = torch.randn((fan_in, fan_out)) / scale
    self.bias = None
    if bias:
      self.bias = torch.zeros(fan_out)

  def __call__(self, x):
    result = x @ self.weight
    if self.bias is not None:
      result += self.bias
    self.out = result
    return self.out

  def parameters(self):
    """Return the trainable tensors: weight first, then bias if present."""
    params = [self.weight]
    if self.bias is not None:
      params.append(self.bias)
    return params

# -----------------------------------------------------------------------------------------------
class BatchNorm1d:
  """Batch normalization over the last (feature) dimension.

  In training mode (``self.training`` is True) the input is normalized with the
  statistics of the current batch, and those statistics are folded into the
  running buffers. Otherwise the running buffers are used for normalization.

  Supported training inputs are 2-D ``(batch, dim)`` and 3-D
  ``(batch, time, dim)`` tensors; any other rank raises ``ValueError``.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps  # added to the variance before the square root
    self.momentum = momentum  # weight of the newest batch in the running buffers
    self.training = True
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # buffers (trained with a running 'momentum update')
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    # calculate the forward pass
    if self.training:
      if x.ndim == 2:
        dim = 0
      elif x.ndim == 3:
        dim = (0, 1)  # treat batch and time as one pooled batch axis
      else:
        # fail with a clear message instead of an UnboundLocalError on `dim`
        raise ValueError(f"BatchNorm1d expects a 2D or 3D input in training mode, got {x.ndim}D")
      xmean = x.mean(dim, keepdim=True)  # batch mean
      xvar = x.var(dim, keepdim=True)  # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # update the buffers
    if self.training:
      with torch.no_grad():  # buffer updates must not become part of the autograd graph
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out

  def parameters(self):
    """Return the trainable tensors (scale ``gamma`` and shift ``beta``)."""
    return [self.gamma, self.beta]

# -----------------------------------------------------------------------------------------------
class Tanh:
  """Element-wise tanh non-linearity; the latest output is kept on ``self.out``."""

  def __call__(self, x):
    activated = x.tanh()
    self.out = activated
    return activated

  def parameters(self):
    # no trainable state
    return []

# -----------------------------------------------------------------------------------------------
class Embedding:
  """Lookup table with one ``embedding_dim``-sized row per index.

  Calling the layer with an integer index tensor ``IX`` returns the matching
  rows of ``weight``; the result is also kept on ``self.out``.
  """

  def __init__(self, num_embeddings, embedding_dim):
    table_shape = (num_embeddings, embedding_dim)
    self.weight = torch.randn(table_shape)  # standard-normal initial rows

  def __call__(self, IX):
    rows = self.weight[IX]
    self.out = rows
    return rows

  def parameters(self):
    """Return the trainable tensors (just the lookup table)."""
    return [self.weight]

# -----------------------------------------------------------------------------------------------
class FlattenConsecutive:
  """Concatenate every ``n`` consecutive time steps along the feature axis.

  A ``(B, T, C)`` input becomes ``(B, T // n, C * n)``; when only one time step
  remains, that axis is dropped and the result is ``(B, C * n)``.
  """

  def __init__(self, n):
    self.n = n

  def __call__(self, x):
    batch, steps, channels = x.shape
    grouped = x.view(batch, steps // self.n, channels * self.n)
    # drop the time axis once it has collapsed to a single step
    self.out = grouped.squeeze(1) if grouped.shape[1] == 1 else grouped
    return self.out

  def parameters(self):
    # no trainable state
    return []

# -----------------------------------------------------------------------------------------------
class Sequential:
  """Container that applies its layers one after another.

  The output of the final layer is kept on ``self.out``.
  """

  def __init__(self, layers):
    self.layers = layers

  def __call__(self, x):
    current = x
    for stage in self.layers:
      current = stage(current)
    self.out = current
    return self.out

  def parameters(self):
    """Return the parameters of every layer as one flat list, in layer order."""
    collected = []
    for stage in self.layers:
      collected.extend(stage.parameters())
    return collected


In [15]:
torch.manual_seed(42); # seed torch's rng for reproducibility (the trailing ';' keeps the notebook from echoing the return value)

In [282]:
# hierarchical network
# Each stage fuses pairs of neighbouring positions (FlattenConsecutive(2)) and
# therefore halves the sequence length. Collapsing a context of block_size
# characters to a single vector takes log2(block_size) stages, so the depth is
# derived from block_size instead of being hard-coded to four stages (which
# only fits block_size == 16).
if block_size < 2 or block_size & (block_size - 1):
  raise ValueError(f"block_size must be a power of two >= 2, got {block_size}")
n_levels = block_size.bit_length() - 1  # log2(block_size) for a power of two

layers = [Embedding(vocab_size, n_embd)]
fan_in = n_embd  # feature width entering the next stage
for _ in range(n_levels):
  layers += [
    FlattenConsecutive(2), Linear(fan_in * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  ]
  fan_in = n_hidden
layers.append(Linear(n_hidden, vocab_size))  # output layer producing the logits
model = Sequential(layers)

# parameter init
with torch.no_grad():
  model.layers[-1].weight *= 0.1 # last layer make less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True  # enable gradient tracking on the raw tensors

107919


In [283]:
# same optimization as last time
# Minibatch SGD for max_steps steps; lossi collects log10(loss) of every step.
lossi = []

for i in range(max_steps):

    # minibatch construct: sample batch_size example indices uniformly, with replacement
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function

    # backward pass: clear old gradients first, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: simple SGD
    lr = 0.1 if i < max_steps * 0.9 else 0.01  # step learning rate decay: 0.1 for the first 90% of steps, then 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())  # stored as log10 of the loss

      0/  85000: 3.4286
  10000/  85000: 1.9855
  20000/  85000: 1.6178
  30000/  85000: 1.9301
  40000/  85000: 1.8694
  50000/  85000: 1.6738
  60000/  85000: 1.6048
  70000/  85000: 1.6871
  80000/  85000: 1.5885


In [3]:
# Smooth the per-step log10 losses by averaging consecutive groups of 1000 steps;
# view(-1, 1000) requires len(lossi) to be a multiple of 1000.
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1))

NameError: name 'lossi' is not defined

In [4]:
# put layers into eval mode: BatchNorm1d then normalizes with its running
# statistics instead of per-batch statistics (the other layers never read the flag)
for layer in model.layers:
    layer.training = False


# evaluate the loss
@torch.no_grad() # this decorator disables gradient tracking inside pytorch
def split_loss(split):
  """Print the cross-entropy loss of the model on one dataset split.

  ``split`` is one of 'train', 'val' or 'test'; any other value raises
  ``KeyError``. The whole split is pushed through the model in a single
  forward pass.
  """
  datasets = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }
  inputs, targets = datasets[split]
  loss = F.cross_entropy(model(inputs), targets)
  print(f'{split}, {loss.item():.3f}')

# Report the loss on the training split first, then on the validation split.
for split_name in ('train', 'val'):
  split_loss(split_name)

NameError: name 'model' is not defined

## Sample from the Model

In [286]:
# Generate 20 lines, one character at a time.
# NOTE: the model is called with a batch of one, so this assumes the layers
# were switched to eval mode beforehand (BatchNorm1d then uses its running
# statistics rather than statistics of the single example).
with torch.no_grad():  # inference only: do not build an autograd graph per step
    for _ in range(20):

        out = []
        context = [0] * block_size  # start from an all-padding context (index 0, shown as '-')
        while True:
            # forward pass the neural net
            logits = model(torch.tensor([context]))
            probs = F.softmax(logits, dim=1)
            # sample from the distribution
            ix = torch.multinomial(probs, num_samples=1).item()
            # shift the context window and track the samples
            context = context[1:] + [ix]

            if ix == 0:  # index 0 is the end-of-line token: stop without recording it
                break
            out.append(ix)

        print("".join(itos[i] for i in out))  # decode and print the generated line

was the time water in made, aaron again deasing intenty, then a have one with powing sins treationah, we thuse the ground. i raw
books otherstopprises expuforest.
that the bized, and we care on the
every ger name of the for a securd talkignaphisit for
connectly. kingles. any
he carridon. suffering boin gianty. i this which siep are was ight, and appeared nathank great he
lower and they descrupse children. that he had an it. surving clouds when could an into importation, he moment. the commanded inside magnitured you cant to guys of crack not have
too a piarnued. ran clarternous.
califohs and sun a gldelf heavens in hetfferzor. im was scrusties, and got have some to
this ockepresmeek to her faunding, but thinks too captain offs undering of not started intil follow, the first cansion hir just desthee made me books, said jefile that most nos ground, letters. that
scause things with publication she sparkader, wheeld and pullen ahanial be celread
paperty that almost exogents get table the l

# Results

## Hyper-Parameters

| Run | min line len | max line len | block_size | n_embd | n_hidden | model      | lr  | train | val   | params | max_steps | batch_size |
| --- | ---          | ---          | ---        | ---    | ---      | ---        | --- | ---   | ---   | ---    | ---       | ---        |
|  1  |  10          |  30          | 8          |   24   |  128     | e flbt*3 l | 0.1 | 1.794 | 1.970 | 77191  | 5000      | 32         |
|  2  |              |  60          |            |        |          |            |     |       |       |        |           |            |
|  3  |              |              |            |        |          |            |     | 1.707 | 1.795 |        | 25000     |            |
|  4  |              |              |            |   16   |          |            |     |       | 1.660 |        | 25000     |            |


## Outputs


### Run 1


shat as langelet
yi , inkakiag.
god to for thing morn?
a nating bres?
the krust rove.
shupielpag
pbooid, teelof ation.
ditighous hne i lated oull.
wor hdie to happere
last, i nigld remettmtion.
thatj issation.
brobel you from whilovisate.
nate to reelon.
plought the?
wastaticanly wamilve.
it to hor lest destit dom.
down turiestrone ming, thatlom onebous than
a goal an?
thy bour that sabla rabury.
his could?

![alt text](b7593670-30a3-4b1f-a3be-486470694d78.png)



### Run 2


of thaples freated by sucas sled at if ihselice yom.
waic or to on ter crous she jusu anot to be. am dodk teireens. ksobk in well kstsice of
hadan seo to khou
greard tot oth atdelled why oullnt sowq is ject dadens. the. and the comen witels the cont it ean


### Run 3

whaters phen, said a for um, happenical one seeh stop ment. oke
see sive me, screast
comma tritinal war, so sone . . .
then difta a mongs it,
to she laged.
on that. you will by mysis. i asket.
well an it. bletea of habribe going mypaary build think was a aare in mooted awaysh i he some who do vows tnen presever her tolla comtally, as into thech will gmorize big i dxank i did coung kae in i with mose the lupdr asked parsing it is poesen us. starn, whose it feels speod, i seefern to got starthy. him all,.  him his ip men. lejtion spectliel. i tosly wure a my pupia starcians spaccifally spowred heave lace. im it it,
anage of the maybe somextatifu. where balch ituelly frre fhen different
a croul . . . stohne as terrible



### Run 4

