In [1]:
# Herein are cells that 
# 1. Create a text class that has methods to determine the vocabulary
# within the text, encode/decode the text, split the text content into
# validation, training and test sets.
# 
# 2. LSTM language model for training and generating text.
#
# 3. Reading in of a simple test case
#  Training and validation loop for simple test case.
#
# 4. Tiny Shakespeare training and generation. 
# 

# Refactoring - there is some logic in moving the vocab/encoding/decoding 
# methods from the text class into the language model, since the language
# model has to deal with an appropriately encoded/decoded 
# set of data - so saving the weights of the language model is useless
# without knowing the encodings! On the other hand ... that would 
# put two different types of things in the same object ... so leave as
# is for the moment. 

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class text:
    '''Character-level corpus: vocabulary, encode/decode and data splits.

    The vocabulary is the sorted set of distinct characters in the text.
    ``str_to_int`` / ``int_to_str`` map each character to its position in
    that sorted list and back. After ``encode_text_as_tensor()`` the
    ``text`` attribute holds a 1-D ``torch.long`` tensor instead of a string.
    '''

    def __init__(self, text_name, is_file):
        '''Load the text and build the character vocabulary.

        text_name: a file name when ``is_file`` is true, otherwise the text
                   itself.
        is_file:   whether ``text_name`` should be read from disk.
        '''
        # Retrieve text
        self.text = ''
        self.encoded = ''
        if is_file:
            print("reading text from file")
            self.from_file(text_name)
        else:
            self.from_string(text_name)

        # Vocabulary: sorted list of the unique characters; its length is
        # the vocab size.
        self.chars = sorted(set(self.text))
        self.vocab_size = len(self.chars)
        self.str_to_int = self.make_str_to_int_table(self.chars)
        self.int_to_str = self.make_int_to_str_table(self.chars)

    def from_file(self, filename):
        'Read the whole UTF-8 file into self.text.'
        # Context manager so the handle is closed even if read() fails.
        with open(filename, 'r', encoding='utf-8') as handle:
            self.text = handle.read()

    def from_string(self, string):
        'Use the given string as the text.'
        self.text = string

    def make_str_to_int_table(self, chars):
        '''Return a dictionary mapping each character to its index in chars.'''
        return {character: index for index, character in enumerate(chars)}

    def make_int_to_str_table(self, chars):
        '''Return a dictionary mapping each index in chars to its character.'''
        return {index: character for index, character in enumerate(chars)}

    def encode_text_as_tensor(self):
        '''Encode the text as integers and replace self.text with the
        resulting torch.long tensor.

        Calling this again once the text is already a tensor is a no-op, so
        re-running a notebook cell does not fail on a lookup of tensor
        elements in the character table.'''
        if isinstance(self.text, torch.Tensor):
            return
        self.text = self.encode_new_text_as_tensor(self.text)

    def encode_new_text_as_tensor(self, to_encode):
        '''Encode a new text as a torch.long tensor of integers, using the
        encoding derived from the object's own text.

        Raises KeyError if to_encode contains a character that is not in the
        vocabulary.'''
        codes = [self.str_to_int[char] for char in to_encode]
        return torch.tensor(codes, dtype=torch.long)

    def decode(self, to_decode):
        '''Decode an iterable of integers to a string, using the
        vocabulary attached to the object: self.int_to_str.'''
        return ''.join(self.int_to_str[i] for i in to_decode)

    def __str__(self):
        return f"{self.vocab_size}"

    def make_train_val_test(self, fraction_train, fraction_val, fraction_test):
        '''Split self.text into contiguous train / validation / test slices.

        No shuffling is done, so this assumes there is no bias in how the
        content is distributed within the text. The slices are stored in
        self.train_data, self.val_data and self.test_data. A warning is
        printed (nothing is raised) when the fractions do not sum to one.'''
        total = fraction_train + fraction_val + fraction_test
        # Compare with a tolerance: sums such as 0.7 + 0.1 + 0.2 are not
        # exactly 1.0 in floating point.
        if abs(total - 1) > 1e-9:
            print("Warning, fractions of train, test and validation do "
                  "not add to one.")
        length = len(self.text)
        n = int(fraction_train * length)
        nv = int(fraction_val * length)
        nt = int(fraction_test * length)
        self.train_data = self.text[:n]
        self.val_data = self.text[n:n + nv]
        self.test_data = self.text[n + nv:n + nv + nt]

    def get_batch(self, batch_size, block_size, train_test_validation):
        """Randomly pick windows from the chosen split and return them
        stacked as (context, target) tensors.

        Each context is block_size consecutive tokens; its target is the
        same window shifted one position to the right.

        Raises ValueError for an unknown split name, or when the split does
        not hold more than block_size tokens (a window plus one extra token
        is needed for the shifted target).
        """
        if train_test_validation == "train":
            data = self.train_data
        elif train_test_validation == "test":
            data = self.test_data
        elif train_test_validation == "validation":
            data = self.val_data
        else:
            raise ValueError(
                "Specify data set as 'train', 'test', or 'validation'.")

        # `<=`, not `<`: with len(data) == block_size there is no valid
        # start index and torch.randint(0, ...) would raise an unrelated
        # RuntimeError.
        if len(data) <= block_size:
            raise ValueError(
                "Data must be longer than the specified block size.")

        ix = torch.randint(len(data) - block_size, (batch_size,))
        train_context_batch = torch.stack(
            [data[i:i + block_size] for i in ix])
        train_to_predict_batch = torch.stack(
            [data[i + 1:i + block_size + 1] for i in ix])
        return train_context_batch, train_to_predict_batch

@torch.no_grad()
def estimate_loss(number_eval_batches, 
                  language_model, training_text_object, block_size):
  '''Average the loss over number_eval_batches random batches (batch size 1)
  drawn from the training split and from the validation split.

  The model is put in eval mode for the measurement and afterwards returned
  to whatever mode it was in when this function was called, even if the
  evaluation raises.

  Returns a dictionary with keys 'train' and 'validation'; each value is a
  0-dim tensor holding the mean loss for that split.'''
  was_training = language_model.training
  language_model.eval()
  out = {}
  try:
    for split in ('train', 'validation'):
      losses = torch.zeros(number_eval_batches)
      for k in range(number_eval_batches):
        X, Y = training_text_object.get_batch(1, block_size, split)
        _, loss = language_model(X, Y)
        losses[k] = loss.item()
      out[split] = losses.mean()
  finally:
    # Restore the caller's mode rather than forcing train mode.
    language_model.train(was_training)
  return out

In [4]:
# LSTM  
#
class LanguageModel_LSTM(nn.Module):
  '''Character-level language model: embedding -> LSTM -> two-layer MLP.

  forward() produces logits over the vocabulary for every position of the
  context (length block_size); each position's logits are trained to
  predict the character that follows it.
  generate() turns the logits of the last position into a probability
  distribution and samples the next character from it.

  Training learns the values in the embedding table, the weights of the
  LSTM and the weights of the fully connected layers that precede the
  softmax.'''

  def __init__(self, vocab_size, hidden_size, num_layers, verbose=False):
    '''vocab_size:  number of distinct tokens; also used as the embedding
                    width and as the LSTM input size.
    hidden_size: width of the LSTM hidden state and of the first linear layer.
    num_layers:  number of stacked LSTM layers.
    verbose:     when true, forward()/generate() print intermediate tensors.'''
    super().__init__()
    self.verbose = verbose
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    # Scale the initial embeddings down to small values, with the aim of
    # starting training from a near-uniform (maximum entropy) prediction.
    with torch.no_grad():
      self.token_embedding_table.weight.mul_(0.01)
    self.input_size = vocab_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    # (h0, c0) are not passed in forward(), so they default to zeros.
    self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers,
                        batch_first=True)
    self.fully_connected = nn.Sequential(
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, vocab_size),
    )

  def set_verbose(self, verbose):
    '''Switch the diagnostic printing on or off.'''
    self.verbose = verbose

  def forward(self, idx, targets=None):
    '''Compute logits, and the cross-entropy loss when targets are given.

    idx:     (B, T) tensor of token indices, the context.
    targets: optional (B, T) tensor of token indices to predict.

    Returns (logits, loss). Without targets, logits is (B, T, C) and loss
    is None. With targets, logits is returned flattened to (B*T, C) and
    loss is a scalar tensor.
    B is the batch size, T the block size (time), C the vocab size.'''
    embedded = self.token_embedding_table(idx)  # (B, T, C)
    # batch_first LSTM output: (B, T, hidden_size)
    out, _ = self.lstm(embedded)
    logits = self.fully_connected(out)  # (B, T, C)

    if targets is None:
      return logits, None

    # F.cross_entropy expects (N, C) logits and (N,) class labels, so fold
    # the batch and time dimensions together. reshape (rather than view)
    # also accepts non-contiguous targets, e.g. a transposed batch.
    B, T, C = logits.shape
    if self.verbose: print(f"Targets: \n {targets} \n; logits: \n{logits} ")
    logits = logits.reshape(B*T, C)
    targets = targets.reshape(B*T)
    if self.verbose: print(f"Transformed view: Targets: \n {targets} \n; logits:\n{logits}\n")
    if self.verbose:
      # Show what the model would currently sample at every position.
      probs = F.softmax(logits, dim=1)
      idx_next = torch.multinomial(probs, num_samples = 1)
      print(f"Generative outcomes during training:\n {idx_next}")

    # cross_entropy applies the softmax itself.
    loss = F.cross_entropy(logits, targets)
    return logits, loss

  @torch.no_grad()
  def generate(self, idx, max_new_tokens):
    '''Sample max_new_tokens tokens, one at a time, each appended to the
    running context idx of shape (B, T). Returns the (B, T + max_new_tokens)
    tensor of indices.

    Runs without gradient tracking: sampling is not part of training, so
    no autograd graph is built for the generated steps.'''
    for _ in range(max_new_tokens):
      # Get the predictions for the whole current context.
      logits, _ = self(idx)  # calls forward() for this class

      # Keep only the last time step of each sequence: (B, C).
      logits = logits[:, -1, :]

      # Generate probability distribution
      probs = F.softmax(logits, dim=-1)  # (B, C)
      if self.verbose:
          print(f'logits {logits}, {logits.shape} \n probs {probs}, {probs.shape}')

      # Sample the vocabulary according to the probability distribution.
      idx_next = torch.multinomial(probs, num_samples = 1)  # (B, 1)
      # Append sample to sequence
      idx = torch.cat((idx, idx_next), dim = 1)
    return idx

In [6]:
# Toy corpus for the language model: the string
# "ABCD EFGH IJKL MNOP QRSTU VWXYZ " repeated 100 times.
# Wrap it in a text object, encode it as a tensor of integers and
# split it 80/10/10 into training, validation and test data.
string = "ABCD EFGH IJKL MNOP QRSTU VWXYZ " * 100
AB_string_3 = text(text_name=string, is_file=False)
AB_string_3.encode_text_as_tensor()
AB_string_3.make_train_val_test(
    fraction_train=0.8, fraction_val=0.1, fraction_test=0.1)
print(AB_string_3.train_data)
print(AB_string_3.vocab_size)

tensor([ 1,  2,  3,  ..., 25, 26,  0])
27


In [13]:
# Hyperparameters.
# batch_size and block_size are deliberately tiny so that, with verbose
# printing switched on in the model, the tensors flowing through the
# network are small enough to inspect by eye.
batch_size = 2
block_size = 4
number_of_evaluation_batches = 10
evaluation_interval = 500
learning_rate = 1e-3
num_layers = 1
# Embedding width and LSTM hidden width are both set to the vocab size.
input_size = AB_string_3.vocab_size
hidden_size = AB_string_3.vocab_size

m3 = LanguageModel_LSTM(input_size, hidden_size, num_layers, verbose=False)
optimizer = torch.optim.AdamW(m3.parameters(), lr=learning_rate)

# Training loop.
# Sanity check for step 0: an equally uncertain prediction over the
# vocabulary has loss -ln(1/vocab_size), i.e. about 3.30 for the 27
# symbols of this toy corpus.
for step in range(4000):
  # get a new batch
  xb, yb = AB_string_3.get_batch(batch_size, block_size, 'train')
  logits, loss = m3(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  # Report only every evaluation_interval steps.
  if step % evaluation_interval != 0:
    continue
  print(f"current iteration loss, at step {step}: {loss.item()}")
  losses = estimate_loss(number_of_evaluation_batches, m3, AB_string_3, block_size)
  print(f"step {step}: \n \
           loss averaged over {number_of_evaluation_batches} batches:\
           \n training loss: {losses['train']:.4f}, \
          \n validation loss {losses['validation']:.4f}\n\n")

# Final report once training has finished.
losses = estimate_loss(number_of_evaluation_batches, m3, AB_string_3, block_size)
print(f"final loss: \n \
        loss averaged over {number_of_evaluation_batches} batches:\
        \n training loss: {losses['train']:.4f}, \
        \n validation loss {losses['validation']:.4f}\n\n")

current iteration loss, at step 0: 3.353686571121216
step 0: 
            loss averaged over 10 batches:           
 training loss: 3.3386,           
 validation loss 3.3218


current iteration loss, at step 500: 1.8055510520935059
step 500: 
            loss averaged over 10 batches:           
 training loss: 1.3512,           
 validation loss 1.1075


current iteration loss, at step 1000: 0.29412171244621277
step 1000: 
            loss averaged over 10 batches:           
 training loss: 0.3203,           
 validation loss 0.3035


current iteration loss, at step 1500: 0.12038694322109222
step 1500: 
            loss averaged over 10 batches:           
 training loss: 0.1839,           
 validation loss 0.2092


current iteration loss, at step 2000: 0.04319122061133385
step 2000: 
            loss averaged over 10 batches:           
 training loss: 0.2336,           
 validation loss 0.1353


current iteration loss, at step 2500: 0.006636984646320343
step 2500: 
            los

In [15]:
# Seed the generator with a single space, encoded with the toy corpus
# vocabulary.
context = AB_string_3.encode_new_text_as_tensor(' ')
print(context)
# generate() expects a (batch, time) tensor, so add a leading batch dimension.
context = context.unsqueeze(0)
print(context)
m3.set_verbose(False)
# Sample 25 further characters and show them as indices, then as text.
new_string = m3.generate(context, max_new_tokens=25)
print(new_string)
print(AB_string_3.decode(new_string[0].tolist()))

tensor([0])
tensor([[0]])
tensor([[ 0,  5,  6,  7,  8,  0,  9, 10, 11, 12,  0, 13, 14, 15, 16,  0, 17, 18,
         19, 20, 21,  0, 22, 23, 24, 25]])
 EFGH IJKL MNOP QRSTU VWXY


In [None]:
# Next, tiny Shakespeare: read the corpus from input.txt, report its
# vocabulary, encode it as a tensor and split it 80/10/10.
print("shakespeare")
shakespeare = text(text_name="input.txt", is_file=True)
print(f"chars, {shakespeare.chars}")
print(f"vocab size: {shakespeare.vocab_size}")
shakespeare.encode_text_as_tensor()
shakespeare.make_train_val_test(
    fraction_train=0.8, fraction_val=0.1, fraction_test=0.1)
