# Dataset Setup

## Dependency Setup

In [1]:
!pip3 install convokit

Collecting convokit
  Downloading convokit-3.0.0.tar.gz (183 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m174.1/183.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.2/183.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting msgpack-numpy>=0.4.3.2 (from convokit)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl (6.9 kB)
Collecting dill>=0.2.9 (from convokit)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting clean-text>=0.6.0 (from convokit)
  Downloa

In [2]:
import nltk; nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
from convokit import Corpus, download
# Download the 'movie-corpus' dataset through ConvoKit and load it into a Corpus object.
# NOTE(review): `corpus` is not referenced by the later cells shown here, which read
# utterances.jsonl from the download directory directly - confirm whether it is still needed.
corpus = Corpus(filename=download('movie-corpus'))

Downloading movie-corpus to /root/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
default_backend: mem


## Compile character dialogue

In [5]:
import json
import os
from pprint import pprint

# Locate the downloaded corpus through the user's home directory rather than through a
# path relative to the current working directory ('../root/...'), which only resolves
# when the notebook happens to be started from a direct child of '/'.
utterances_path = os.path.join(
    os.path.expanduser('~'), '.convokit', 'downloads', 'movie-corpus', 'utterances.jsonl'
)

# Parse one JSON object per line. Blank lines are skipped, and the input file is closed
# as soon as parsing is finished instead of staying open while the output is written.
with open(utterances_path) as json_file:
  utterances = [json.loads(line) for line in json_file if line.strip()]

# Find starts of conversations, and map each utterance ID to the utterance that replies to it
convo_starts = []
next_utterance_map = {}
for utterance in utterances:
  parent_id = utterance['reply-to']
  if not parent_id:
    # No parent: this utterance opens a conversation
    convo_starts.append(utterance)
  else:
    # If several utterances reply to the same parent, the last one in the file wins
    next_utterance_map[parent_id] = utterance

# Compile character dialogue into dialogue.txt file, one utterance per line,
# following each conversation's reply chain from its first utterance to its last
with open('dialogue.txt', 'w') as output_file:
  for convo_start in convo_starts:
    current_utterance = convo_start
    while current_utterance is not None:
      output_file.write(current_utterance['text'] + '\n')
      current_utterance = next_utterance_map.get(current_utterance['id'])

# Code

## Imports


In [2]:
import torch
import torch.nn as nn
from torch.nn import functional

## Data

In [3]:
# Read the entire compiled dialogue file into memory as a single string
with open('dialogue.txt') as dialogue_file:
  dialogue = dialogue_file.read()

# The vocabulary is every distinct character that occurs in the dialogue, in sorted order
char_set = sorted(set(dialogue))
vocab_size = len(char_set)
print(char_set, vocab_size, sep='\n')

['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']
94


In [4]:
# Character-level tokenizer: a token is the character's index in the sorted char_set
token_to_char = dict(enumerate(char_set))
char_to_token = {char: token for token, char in token_to_char.items()}


def encode(string):
  """Convert a string into a list of integer tokens, one per character."""
  return [char_to_token[char] for char in string]


def decode(tokens):
  """Convert an iterable of integer tokens back into the string they spell."""
  return ''.join(token_to_char[token] for token in tokens)


# Round-trip check: encoding then decoding should reproduce the input
print(encode('testing'))
print(decode(encode('testing')))

[83, 68, 82, 83, 72, 77, 70]
testing


In [5]:
# Tokenize the whole corpus once and keep it as a 1-D tensor of int64 token ids
dataset = torch.tensor(encode(dialogue), dtype=torch.long)
print(dataset.shape, dataset.dtype)

# The first 90% of the tokens are used for training; the remaining tail is held out for validation
n = int(0.9*len(dataset))
train_data, val_data = dataset[:n], dataset[n:]

torch.Size([17143013]) torch.int64


In [9]:
def get_batch(data_source):
  """Sample a random batch of training examples.

  Args:
    data_source: 'train' to sample from train_data; any other value samples from val_data.

  Returns:
    (input_tokens, target_tokens), each of shape (num_batches, max_context_length) and moved
    to `device`. target_tokens is input_tokens shifted one position to the right, so the
    target at position t is the token that follows the input at position t.
  """
  if data_source == 'train':
    data = train_data
  else:
    data = val_data

  # Every start index leaves room for max_context_length inputs plus one extra target token
  window_starts = torch.randint(len(data) - max_context_length, (num_batches,))

  input_windows, target_windows = [], []
  for window_start in window_starts:
    window_end = window_start + max_context_length
    input_windows.append(data[window_start:window_end])
    target_windows.append(data[window_start+1:window_end+1])

  input_tokens = torch.stack(input_windows).to(device)
  target_tokens = torch.stack(target_windows).to(device)
  return input_tokens, target_tokens

# Sample one training batch and show the shapes of the input and target tensors
input_tokens, target_tokens = get_batch('train')
print(input_tokens.shape, target_tokens.shape, sep='\n')

torch.Size([8, 256])
torch.Size([8, 256])


## Transformer model

In [10]:
@torch.no_grad()
def estimate_loss():
  """Estimate the current loss on the training and validation data.

  For each split, n_eval_iterations random batches are drawn with get_batch() and their
  losses are averaged, which gives a less noisy number than the loss of a single batch.
  Gradients are disabled by the decorator, and the model runs in evaluation mode
  (dropout off) while the batches are scored.

  Returns:
    dict mapping 'train' and 'val' to the mean loss of that split (a 0-dimensional tensor).
  """
  out = {}
  was_training = model.training   # Remember the mode so it can be restored exactly
  model.eval()                    # Puts model into evaluation mode

  try:
    for split in ['train', 'val']:
      losses = torch.zeros(n_eval_iterations)
      for k in range(n_eval_iterations):
        input_tokens, target_tokens = get_batch(split)
        _, loss = model(input_tokens, target_tokens)
        losses[k] = loss.item()
      out[split] = losses.mean()
  finally:
    # Restore the previous mode even if a batch raised, instead of always forcing training mode
    model.train(was_training)

  return out


class SelfAttentionHead(nn.Module):
  """ A single head of masked (causal) self-attention """

  def __init__(self, head_size):
    super().__init__()
    # Three bias-free linear projections re-interpret each token embedding as a head_size-long vector
    self.key = nn.Linear(n_embedding_dimensions, head_size, bias=False)
    self.query = nn.Linear(n_embedding_dimensions, head_size, bias=False)
    self.value = nn.Linear(n_embedding_dimensions, head_size, bias=False)
    # Lower-triangular matrix of ones, stored as a buffer (not a trainable parameter), used to hide future tokens
    causal_mask = torch.tril(torch.ones(max_context_length, max_context_length))
    self.register_buffer('tril', causal_mask)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, x):
    # Input size: (batch, time-step, channels) a.k.a (batch, tokens, n_embedding_dimensions)
    # Output size: (batch, time-step, head_size)
    _, T, _ = x.shape
    keys = self.key(x)       # (B,T,head_size)
    queries = self.query(x)  # (B,T,head_size)
    values = self.value(x)   # (B,T,head_size)

    # Attention scores: entry (i, j) measures how the query of token i interacts with the key of token j.
    # The keys are transposed so that two same-sized tensors can be matrix-multiplied:
    # (B,T,head_size) @ (B,head_size,T) -> (B,T,T)
    # Scaling by head_size**-0.5 keeps the scores small enough that the softmax below does not
    # collapse onto the single highest score and ignore every other token.
    scale = keys.shape[-1] ** -0.5  # keys.shape[-1] is head_size
    weights = queries @ keys.transpose(-2, -1) * scale

    # Hide future tokens: positions above the diagonal are set to -inf so that only earlier
    # tokens and the current token contribute once each row is softmaxed.
    is_future = self.tril[:T, :T] == 0
    weights = weights.masked_fill(is_future, float('-inf'))
    # Softmax each row: masked positions become 0 and the remaining scores become probabilities
    weights = functional.softmax(weights, dim=-1)
    weights = self.dropout(weights)

    # Aggregate the value vectors with the attention weights, which also brings the output
    # back to head_size values per token: (B,T,T) @ (B,T,head_size) -> (B,T,head_size).
    # This head's output is concatenated with the other heads' outputs afterwards.
    return weights @ values


class MultiHeadSelfAttention(nn.Module):
  """ Multiple self-attention heads in parallel

  Args:
    num_heads: number of SelfAttentionHead modules to run side by side.
    head_size: output width of each individual head.
  """

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([SelfAttentionHead(head_size) for _ in range(num_heads)])
    # The concatenated head outputs are num_heads * head_size wide. That equals
    # n_embedding_dimensions only when the embedding size is divisible by the number of heads,
    # so the projection's input width is derived from the heads instead of being assumed.
    self.linear_proj = nn.Linear(num_heads * head_size, n_embedding_dimensions)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, x):
    # num_heads * (B,T,head_size) -> (B,T,num_heads*head_size)  Concatenate the outputs of all the heads
    out = torch.cat([head(x) for head in self.heads], dim=-1)
    # (B,T,n_embedding_dimensions)  Linearly project the combined head outputs back to the embedding width
    out = self.linear_proj(out)
    out = self.dropout(out)
    return out


class FeedForwardLayer(nn.Module):
  """ Position-wise multi-layer perceptron: expand, ReLU, contract, dropout """

  def __init__(self, n_embedding_dimensions):
    super().__init__()
    # The original "Attention Is All You Need" paper widens the hidden layer to
    # 4 * n_embedding_dimensions before scaling back down again
    hidden_dimensions = 4 * n_embedding_dimensions
    layers = [
        nn.Linear(n_embedding_dimensions, hidden_dimensions),
        nn.ReLU(),
        nn.Linear(hidden_dimensions, n_embedding_dimensions),
        nn.Dropout(dropout_rate),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    # Input and output have the same shape
    return self.net(x)


class TransformerBlock(nn.Module):
  """ Transformer decoder block: self-attention followed by a feed-forward network """

  def __init__(self, n_embedding_dimensions, n_heads):
    super().__init__()
    # Since no encoder is used, the block has only self-attention and no cross-attention
    # sub-layer receiving input from an encoder
    self.self_attention = MultiHeadSelfAttention(n_heads, n_embedding_dimensions // n_heads)
    self.feedforward = FeedForwardLayer(n_embedding_dimensions)
    self.ln1 = nn.LayerNorm(n_embedding_dimensions)
    self.ln2 = nn.LayerNorm(n_embedding_dimensions)

  def forward(self, x):
    # Layer norm is applied to the input of each sub-layer rather than to its output (as in GPT models).
    # Each sub-layer's result is added back onto its input as a residual/skip connection, which
    # gives the gradient a path from end to beginning that is not impeded by the attention
    # and feed-forward layers, helping deep networks train.
    after_attention = x + self.self_attention(self.ln1(x))
    after_feedforward = after_attention + self.feedforward(self.ln2(after_attention))
    return after_feedforward


class GPTModel(nn.Module):
  """ Decoder-only transformer language model over the character vocabulary

  Token and position embeddings are summed, passed through a stack of transformer blocks and a
  final layer norm, and projected to one logit per vocabulary entry at every position.
  """

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embedding_dimensions) # Lookup table that stores embeddings for each token in the vocabulary
    self.position_embedding_table = nn.Embedding(max_context_length, n_embedding_dimensions) # Lookup table that stores embeddings for each possible position in the context
    self.transformer_blocks = nn.Sequential(*[TransformerBlock(n_embedding_dimensions, n_heads=n_heads) for _ in range(n_transformer_blocks)]) # Asterisk unpacks the list of transformer blocks into separate arguments
    self.ln_final = nn.LayerNorm(n_embedding_dimensions) # Final layer normalization
    self.logit_generation = nn.Linear(n_embedding_dimensions, vocab_size) # Linear layer that turns the transformer blocks' outputs into logits (raw, unnormalized scores)

    self.apply(self._init_weights) # Initialize the weights of every submodule recursively

  def _init_weights(self, module):
    """Initialize Linear and Embedding weights from N(0, 0.02) and zero any Linear bias."""
    if isinstance(module, nn.Linear):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, input_tokens, targets=None):
    """Compute logits for every position and, when targets are given, the training loss.

    Args:
      input_tokens: (B,T) tensor of token ids, with T no larger than max_context_length.
      targets: optional (B,T) tensor holding the expected next token for every position.

    Returns:
      (logits, loss). Without targets, logits is (B,T,vocab_size) and loss is None.
      With targets, logits is flattened to (B*T,vocab_size) and loss is the cross entropy.
    """
    B, T = input_tokens.shape # B is Batch, T is Time-step (tokens)

    tok_emb = self.token_embedding_table(input_tokens) # (B,T,C) where C is Channel (n_embedding_dimensions)
    # Build the position indices 0..T-1 on the same device as the input instead of relying on a
    # module-level `device` variable, so the model works wherever it and its inputs are placed
    positions = torch.arange(T, device=input_tokens.device)
    pos_emb = self.position_embedding_table(positions) # (T,n_embedding_dimensions)
    x = tok_emb + pos_emb # (B,T,n_embedding_dimensions)  pos_emb is broadcast across the batch
    x = self.transformer_blocks(x) # (B,T,n_embedding_dimensions)  One vector per time step, informed by the current token and the tokens before it
    x = self.ln_final(x) # (B,T,n_embedding_dimensions)
    logits = self.logit_generation(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      # The loss covers every position, i.e. every context length from 1 up to T, so predictions
      # made from short contexts are trained as well as predictions made from full contexts
      B, T, C = logits.shape
      logits = logits.view(B*T, C)      # Merge the batch and time axes: one row of vocab_size logits per position
      targets = targets.view(B*T)       # Matching 1D tensor with the target token of each position
      loss = functional.cross_entropy(logits, targets) # Classification cross entropy loss with "targets" as the labels

    return logits, loss

  def generate(self, input_tokens, max_new_tokens):
    """Sample max_new_tokens new tokens and append them to input_tokens.

    Sampling runs in evaluation mode with gradients disabled, so dropout does not perturb
    the predicted distribution and no autograd graph is built. The module's previous
    training/evaluation mode is restored before returning.

    Args:
      input_tokens: (B,T) tensor of token ids used as the initial context.
      max_new_tokens: number of tokens to sample.

    Returns:
      (B,T+max_new_tokens) tensor holding the context followed by the sampled tokens.

    Side effects:
      When the batch holds a single sequence, each sampled character is printed as soon as
      it is produced. Larger batches are not printed.
    """
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        for _ in range(max_new_tokens):
          cropped_input = input_tokens[:, -max_context_length:] # Only take in max_context_length amount of context for the next token prediction
          logits, _ = self(cropped_input) # Get predictions

          logits = logits[:, -1, :] # Only keep the prediction at the last position, where all cropped_input tokens are used as context
          probabilities = functional.softmax(logits, dim=-1) # (B,C) where C is vocab size
          next_token = torch.multinomial(probabilities, num_samples=1) # (B,1)  Sample the next token from the softmaxed probabilities
          if next_token.numel() == 1:
            # .item() only works on a single-element tensor, so streaming is limited to batch size 1
            print(decode([next_token.item()]), end='')
          input_tokens = torch.cat((input_tokens, next_token), dim=1) # (B,T+1)  Add the new token to the context for the next token prediction
    finally:
      self.train(was_training)
    return input_tokens



## Main code

In [52]:
# https://www.youtube.com/watch?v=kCc8FmEb1nY

#
# Configuration variables
#

# Hardware: use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model architecture
n_embedding_dimensions = 384    # Width of every token/position embedding
n_heads = 6                     # Self-attention heads per transformer block
n_transformer_blocks = 6        # Number of stacked transformer blocks
dropout_rate = 0.2              # Dropout probability used throughout the model
max_context_length = 256        # Maximum number of tokens the model attends over

# Training schedule
# The value in each trailing comment is presumably the setting for a full-length run,
# with the active value being a reduced one for quick experiments - TODO confirm
num_batches = 8 #64             # Sequences sampled per batch
max_iters = 50 #5000            # Optimizer steps to run
learning_rate = 3e-4

# Loss reporting
eval_interval = 2 #500          # Report the estimated loss every this many steps
n_eval_iterations = 5 #200      # Batches averaged per split for each loss estimate

In [11]:
# Build the model and move its parameters to the selected device
# (`m` and `model` refer to the same module, since .to() returns the module itself)
model = GPTModel()
m = model.to(device)

# AdamW optimizer over all of the model's parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Report the total number of parameters, in millions
print(sum(weights.numel() for weights in m.parameters())/1e6, 'M parameters')

10.81123 M parameters


In [53]:
# Training loop
# The loop variable is called `step` rather than `iter` so that Python's built-in iter()
# is not shadowed for the rest of the kernel session.
for step in range(max_iters):

  # Evaluate and print loss for training and validation sets
  # (every eval_interval steps, and always on the final step)
  if step % eval_interval == 0 or step == max_iters - 1:
    losses = estimate_loss()
    print(f'Step {step}: training loss {losses["train"]:.4f}, validation loss {losses["val"]:.4f}')

  # Sample a batch of data
  input_tokens, target_tokens = get_batch('train')

  # Evaluate loss and optimize
  logits, loss = model(input_tokens, target_tokens)
  optimizer.zero_grad(set_to_none=True)   # Clear the gradients left over from the previous step
  loss.backward()
  optimizer.step()

Step 0: training loss 1.5597, validation loss 1.5756
Step 4: training loss 1.5305, validation loss 1.5584
Step 6: training loss 1.5560, validation loss 1.5222
Step 8: training loss 1.4930, validation loss 1.5061
Step 10: training loss 1.5256, validation loss 1.5923
Step 12: training loss 1.5177, validation loss 1.5522
Step 14: training loss 1.5247, validation loss 1.6094
Step 16: training loss 1.5981, validation loss 1.5525
Step 18: training loss 1.5296, validation loss 1.5539
Step 20: training loss 1.5719, validation loss 1.5680
Step 22: training loss 1.6042, validation loss 1.5507
Step 24: training loss 1.5188, validation loss 1.5145
Step 26: training loss 1.5287, validation loss 1.5669
Step 28: training loss 1.5280, validation loss 1.5436
Step 30: training loss 1.5294, validation loss 1.5535
Step 32: training loss 1.6144, validation loss 1.5928
Step 34: training loss 1.5360, validation loss 1.5870
Step 36: training loss 1.4809, validation loss 1.6149
Step 38: training loss 1.5685, v

In [63]:
# Generate
# Start from a context holding the single token id 1, shape (1, 1).
# In the vocabulary printed earlier in this notebook, index 1 is the newline character.
context = torch.full((1, 1), 1, dtype=torch.long, device=device)

# generate() prints every character as it is sampled. To print the finished text in one go
# instead, decode the returned tensor: decode(m.generate(context, max_new_tokens=100)[0].tolist())
m.generate(context, max_new_tokens=100)
print()

This the sep and then make life at we do we wain't.
I fruit. imptic.
Oh, did you know like you shooo
