In [1]:
# Read the whole corpus into memory as a single string. The encoding is pinned
# so the decoded text does not depend on the platform's default locale encoding.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [2]:
# Report the corpus size; len() of a str counts characters, not bytes.
print(f'Length of the characters in the dataset: {len(text)}')

Length of the characters in the dataset: 5458199


# Vocabulary
The vocabulary is the set of characters (tokens) that the model can see or emit.

In [3]:
# The vocabulary is every distinct character in the corpus. Sorting gives each
# character a stable position, which is later used as its token id.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !"#%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz|}~
91


# Encoding (Tokenizing)
Here we define a simple character-level encoder and decoder, then use the encoder
to encode the entire input text and print the first hundred token ids.

In [4]:
from typing import List

# Lookup tables between a character and its position in the sorted vocabulary.
# Encoder: character -> position.
_ENCODER_MAPPING = dict(zip(chars, range(len(chars))))

# Decoder: position -> character (the inverse of the encoder).
_DECODER_MAPPING = dict(enumerate(chars))

def encode(input_text: str) -> List[int]:
  """Map each character of input_text to its vocabulary position.

  Raises KeyError if a character is not present in the encoder mapping.
  """
  return list(map(_ENCODER_MAPPING.__getitem__, input_text))

def decode(input_vector: List[int]) -> str:
  """Map each vocabulary position back to its character and join the result.

  Raises KeyError if a value is not present in the decoder mapping.
  """
  characters = map(_DECODER_MAPPING.__getitem__, input_vector)
  return ''.join(characters)

In [5]:
import torch

# Encode the entire corpus once and keep it as a 1-D tensor of 64-bit token ids,
# then show its size, dtype and the first hundred ids.
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.size(), data.dtype)
print(data[0:100])

torch.Size([5458199]) torch.int64
tensor([51, 69, 70, 80,  1, 70, 80,  1, 81, 69, 66,  1, 16, 15, 15, 81, 69,  1,
        36, 81, 66, 85, 81,  1, 67, 70, 73, 66,  1, 77, 79, 66, 80, 66, 75, 81,
        66, 65,  1, 63, 86,  1, 47, 79, 76, 71, 66, 64, 81,  1, 38, 82, 81, 66,
        75, 63, 66, 79, 68, 11,  1, 62, 75, 65,  0, 70, 80,  1, 77, 79, 66, 80,
        66, 75, 81, 66, 65,  1, 70, 75,  1, 64, 76, 76, 77, 66, 79, 62, 81, 70,
        76, 75,  1, 84, 70, 81, 69,  1, 54, 76])


## Training and Validation Set


In [6]:
# Hold out the tail of the corpus: the first 90% of the tokens are used for
# training and the remainder for validation.
_TRAINING_SPLIT_FRACTION = 0.9
_TRAINING_SPLIT_INDEX = int(len(data) * _TRAINING_SPLIT_FRACTION)
training_data, validation_data = (
    data[:_TRAINING_SPLIT_INDEX],
    data[_TRAINING_SPLIT_INDEX:],
)

# Blocks
The block size is the maximum size of the text that can be fed into the transformer at once. When training a transformer, the entire training text isn't shoved into the transformer at once; doing so would be computationally prohibitive.

When a block of data is sampled like in the first cell below, it's actually multiple examples packed into one due to the fact that the tokens follow one another in the actual text.

Additionally, selecting a certain block size gets the transformer used to seeing context lengths of size 1 all the way to the block size. This means that the block size determines the maximum context length.

In [7]:
# Maximum number of tokens fed to the model at once.
block_size = 8

# Take one token more than the block size so that every position in the block
# has a following token to serve as its prediction target.
training_data[:block_size + 1]

tensor([51, 69, 70, 80,  1, 70, 80,  1, 81])

In [8]:
# The targets are the inputs shifted by one token, so the target at position t
# is the token that follows the first t + 1 input tokens.
transformer_inputs = training_data[:block_size]
transformer_targets = training_data[1:block_size + 1]
for context_length, target in enumerate(transformer_targets, start=1):
  context = transformer_inputs[:context_length]
  print(f'When the input is {context}, the target is: {target}')
print(f'These are the {block_size} examples hidden in a chunk of text of length {block_size + 1}')

When the input is tensor([51]), the target is: 69
When the input is tensor([51, 69]), the target is: 70
When the input is tensor([51, 69, 70]), the target is: 80
When the input is tensor([51, 69, 70, 80]), the target is: 1
When the input is tensor([51, 69, 70, 80,  1]), the target is: 70
When the input is tensor([51, 69, 70, 80,  1, 70]), the target is: 80
When the input is tensor([51, 69, 70, 80,  1, 70, 80]), the target is: 1
When the input is tensor([51, 69, 70, 80,  1, 70, 80,  1]), the target is: 81
These are the 8 examples hidden in a chunk of text of length 9


## Introducing the batch dimension for parallel computing on GPUs

torch.randint docs: https://pytorch.org/docs/stable/generated/torch.randint.html

In [9]:
from typing import Tuple

# Seed torch's global RNG so the randomly sampled batch below is repeatable.
torch.manual_seed(1)

# batch_size: how many sequences are sampled per batch.
# block_size: how many tokens each sampled sequence contains.
batch_size, block_size = 4, 8


def get_batch(split: str) -> Tuple[torch.Tensor, torch.Tensor]:
  """Sample a random batch of input blocks and their next-token targets.

  Draws batch_size random offsets into the chosen split and cuts a block of
  block_size tokens at each one. The targets are the same blocks shifted by
  one token, so targets[b, t] is the token that follows inputs[b, t].

  Reads the module-level batch_size, block_size, training_data and
  validation_data at call time, so rebinding any of them affects later calls.

  Args:
    split: A string specifying whether the returned batch should be from
      the training set or the validation set. "train" is checked for, while
      anything else will be considered validation.
  Returns:
    inputs, targets: Two batch_size x block_size tensors, where the i'th row
      in each tensor corresponds to the input and prediction targets.
  """
  # Named split_data (not data) so the module-level dataset is not shadowed.
  split_data = training_data if split == 'train' else validation_data

  # Offsets are drawn from [0, len(split_data) - block_size) so the target
  # block, which ends one token after the input block, still fits in the split.
  # tolist() yields plain ints for slicing rather than 0-d tensors.
  block_starts = torch.randint(
      len(split_data) - block_size, (batch_size,)
  ).tolist()
  inputs = torch.stack(
      [split_data[start:start + block_size] for start in block_starts]
  )
  targets = torch.stack(
      [split_data[start + 1:start + block_size + 1] for start in block_starts]
  )
  return inputs, targets

# Draw one training batch and show it both as raw tensors and as the
# individual (context, target) examples packed inside each row.
input_batch, target_batch = get_batch('train')
print(f'Input tensor size: {input_batch.shape}')
print(input_batch)
print(f'Target tensor size: {target_batch.shape}')
print(target_batch)

for input_row, target_row in zip(input_batch, target_batch):
  for context_length, target in enumerate(target_row, start=1):
    context = input_row[:context_length]
    print(f'When the input is {context.tolist()}, the target is: {target}')

Input tensor size: torch.Size([4, 8])
tensor([[65,  1, 64, 62, 82, 73, 65, 79],
        [67, 62, 64, 66, 26,  0,  1,  1],
        [ 1, 70, 80,  1, 81, 69, 66,  1],
        [13,  1, 43, 66, 81,  7, 80,  1]])
Target tensor size: torch.Size([4, 8])
tensor([[ 1, 64, 62, 82, 73, 65, 79, 76],
        [62, 64, 66, 26,  0,  1,  1,  1],
        [70, 80,  1, 81, 69, 66,  1, 67],
        [ 1, 43, 66, 81,  7, 80,  1, 79]])
When the input is [65], the target is: 1
When the input is [65, 1], the target is: 64
When the input is [65, 1, 64], the target is: 62
When the input is [65, 1, 64, 62], the target is: 82
When the input is [65, 1, 64, 62, 82], the target is: 73
When the input is [65, 1, 64, 62, 82, 73], the target is: 65
When the input is [65, 1, 64, 62, 82, 73, 65], the target is: 79
When the input is [65, 1, 64, 62, 82, 73, 65, 79], the target is: 76
When the input is [67], the target is: 62
When the input is [67, 62], the target is: 64
When the input is [67, 62, 64], the target is: 66
When th

## Starting Simple: Bigram Language Model

In [10]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed torch's global RNG so the random steps later in this cell are repeatable.
torch.manual_seed(1)

class BigramLanguageModel(nn.Module):
  """Bigram language model: predicts the next token from the current token only.

  The only parameter is an embedding table with one row per token. Looking up
  a token returns its row, which is used directly as the logits for the token
  that follows it.
  """

  def __init__(self, vocabulary_size):
    """Create the model.

    Args:
      vocabulary_size: Number of distinct tokens. Used as both the number of
        embeddings and the size of each embedding.
    """
    super().__init__()

    # A vocabulary_size x vocabulary_size table: row i holds the next-token
    # logits for token i.
    self.token_embedding_table = nn.Embedding(vocabulary_size, vocabulary_size)

  def forward(self, index, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Shape names: B = batch size, T = time (sequence length), C = channels
    (vocabulary size).

    Args:
      index: (B, T) tensor of integer token ids.
      targets: Optional (B, T) tensor of integer token ids to predict at each
        position of index.
    Returns:
      logits, loss: When targets is None, logits has shape (B, T, C) and loss
        is None. Otherwise logits is flattened to (B*T, C) and loss is the
        cross-entropy between those logits and the flattened targets.
    """
    # logits are the scores for the next token
    logits = self.token_embedding_table(index)  # (B, T, C)

    if targets is None:
      return logits, None

    # https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
    # cross_entropy expects the class dimension second, so the batch and time
    # dimensions are merged to accommodate this: logits become (B*T, C) and
    # targets become (B*T).
    B, T, C = logits.shape
    logits = logits.view(B * T, C)
    loss = F.cross_entropy(logits, targets.view(B * T))
    return logits, loss

  @torch.no_grad()  # Sampling is never backpropagated through; skip autograd graphs.
  def generate(self, index, max_new_tokens):
    """Extend each sequence in index by sampling max_new_tokens more tokens.

    Args:
      index: Current context of some characters in a batch. Shape BxT. This
        function's job is to extend index to be Bx(T+1), Bx(T+2), etc for
        max_new_tokens.
      max_new_tokens: How many additional tokens to predict from the given
        context.
    Returns:
      A Bx(T + max_new_tokens) tensor: the given context followed by the
      sampled tokens.
    """
    for _ in range(max_new_tokens):
      # Get the predictions for the current context; no targets, so no loss.
      logits, _ = self(index)

      # Only the last time step predicts the token to append.
      logits = logits[:, -1, :]  # (B, C)

      # Turn the scores into a probability distribution over the vocabulary.
      probs = F.softmax(logits, dim=-1)  # (B, C)

      # Sample one token per sequence from that distribution.
      index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

      # Append the sampled token to the running sequence.
      index = torch.cat((index, index_next), dim=1)  # (B, T+1)

    return index



  
# Build the untrained model and run one forward pass on the sample batch to
# inspect the logits shape and the initial loss.
bigram_language_model = BigramLanguageModel(vocab_size)
logits, loss = bigram_language_model(input_batch, target_batch)
print(logits.shape)
print(loss)

# Start generation from a single sequence (B=1, T=1) holding token id 0, i.e.
# whatever the first character in the vocabulary is.
index = torch.zeros(1, 1, dtype=torch.long)

# generate returns a 1x(1 + max_new_tokens) tensor; take its only row as a
# plain list of ids and decode it back to text.
generations = bigram_language_model.generate(index, max_new_tokens=100)[0].tolist()
print(decode(generations))

torch.Size([32, 91])
tensor(4.9227, grad_fn=<NllLossBackward0>)

.)k6`lcb(v,}y_<tRwfKd>eRwO@c_LAU:}|z@Y70zb[-ZfD
CkcJh[O2]1L0}f@*X]Aq;`OtIO1@6D],5IOL&2L[m*#lA~x-1u
8


In [11]:
# AdamW over every parameter of the bigram model, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(
    params=bigram_language_model.parameters(),
    lr=1e-3,
)

In [12]:
# get_batch reads the module-level batch_size on every call, so rebinding it
# here makes every training batch hold 32 sequences.
batch_size = 32
num_training_steps = 30000
for step in range(num_training_steps):
  # Clear the gradients left over from the previous step.
  optimizer.zero_grad(set_to_none=True)

  input_data, target_data = get_batch('train')
  logits, loss = bigram_language_model(input_data, target_data)

  loss.backward()
  optimizer.step()

# This is the loss of the final batch only, not an average over training.
print(loss.item())

2.452397346496582


In [13]:
# Start generation from a single sequence (B=1, T=1) holding token id 0, i.e.
# whatever the first character in the vocabulary is.
index = torch.zeros(1, 1, dtype=torch.long)

# Sample 100 tokens from the trained model. The result is 1x(1 + 100), so its
# only row is taken, converted to a plain list of ids and decoded to text.
generations = bigram_language_model.generate(index, max_new_tokens=100)[0].tolist()
print(decode(generations))


 a   hild    geaveseast  RERelllelme  he,
 awint  shithathodsepelim  w  IRI  pithadil meblal ske   l


## The mathematical trick in self-attention

First, an inefficient case using a for loop to make each token the average of the tokens before it.

In [14]:
# Toy input: B sequences of T timesteps with C features each.
# NOTE(review): this rebinds `data`, the name earlier cells use for the encoded
# corpus; re-running those cells after this one would read this toy tensor.
B, T, C = 4, 8, 2
data = torch.randn(B, T, C)

# bag_of_words[b, t] is the mean of data[b, 0..t]: every timestep is averaged
# with the timesteps before it and never with later ones.
bag_of_words = torch.zeros((B, T, C))

for batch_index in range(B):
  for prefix_length in range(1, T + 1):
    prefix = data[batch_index, :prefix_length]
    bag_of_words[batch_index, prefix_length - 1] = prefix.mean(dim=0)


In [15]:
# Show the toy input for the first batch element.
data[0]

tensor([[-0.3996, -0.2986],
        [-0.1525, -0.5859],
        [ 0.4731,  1.4990],
        [-1.7875,  0.7657],
        [-0.7264, -1.1719],
        [ 0.7395, -1.7191],
        [-0.2197, -0.3693],
        [-0.8298,  0.1927]])

In [16]:
# Show the running averages for the first batch element: row t is the mean of
# rows 0..t of data[0].
bag_of_words[0]

tensor([[-0.3996, -0.2986],
        [-0.2760, -0.4422],
        [-0.0263,  0.2049],
        [-0.4666,  0.3451],
        [-0.5186,  0.0417],
        [-0.3089, -0.2518],
        [-0.2961, -0.2686],
        [-0.3629, -0.2109]])

### Using matrix multiplication to speed things up

In [None]:
# Multiplying by an all-ones matrix sums over the rows of b, so every row of c
# holds the column-wise sums of b.
torch.manual_seed(42)
a = torch.ones((3, 3))
b = torch.randint(low=0, high=10, size=(3, 2)).to(torch.float32)
c = torch.matmul(a, b)
print(f'a=\n{a}\n-----')
print(f'b=\n{b}\n-----')
print(f'c=\n{c}\n-----')


a=
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
-----
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
-----
c=
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])
-----
