In [4]:
# Herein are cells that 
# 1. Create a text class that has methods to determine the vocabulary
# within the text, encode/decode the text, split the text content into
# validation, training and test sets.
# 
# 2. LSTM language model for training and generating text.
#
# 3. Reading in of a simple test case
#  Training and validation loop for simple test case.
#
# 4. Tiny Shakespeare training and generation. 
# 

# Refactoring note: there is some logic in moving the vocab/encoding/decoding
# methods from the text class into the language model, since the language
# model has to deal with an appropriately encoded/decoded set of data, so
# saving the weights of the language model is useless without also knowing
# the encodings! On the other hand, that would put two different kinds of
# things in the same object, so leave it as is for the moment.

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class text:
    '''Character-level text corpus: vocabulary, encode/decode and data splits.

    The vocabulary is the sorted set of distinct characters in the text.
    `str_to_int` / `int_to_str` map between characters and their index in
    that sorted list.

    Typical use:
        corpus = text("some string", is_file=False)
        corpus.encode_text_as_tensor()
        corpus.make_train_val_test(0.8, 0.1, 0.1)
        x, y = corpus.get_batch(batch_size, block_size, "train")
    '''

    def __init__(self, text_name, is_file):
        '''Load the text and build the character vocabulary.

        text_name -- a file name when `is_file` is true, otherwise the text
                     itself.
        is_file   -- whether `text_name` names a file to read.
        '''
        # Retrieve text
        self.text = ''
        self.encoded = ''  # kept for backward compatibility; not used here
        if is_file:
            print("reading text from file")
            self.from_file(text_name)
        else:
            self.from_string(text_name)

        # Vocabulary: sorted list of the unique characters in the text.
        self.chars = sorted(set(self.text))
        self.vocab_size = len(self.chars)
        self.str_to_int = self.make_str_to_int_table(self.chars)
        self.int_to_str = self.make_int_to_str_table(self.chars)

    def from_file(self, filename):
        'Read the text from a UTF-8 encoded file into self.text.'
        # The context manager closes the handle even if read() raises.
        with open(filename, 'r', encoding='utf-8') as handle:
            self.text = handle.read()

    def from_string(self, string):
        'Use the given string as the text.'
        self.text = string

    def make_str_to_int_table(self, chars):
        '''Return a dictionary mapping each character to its index in chars.'''
        return {character: index for index, character in enumerate(chars)}

    def make_int_to_str_table(self, chars):
        '''Return a dictionary mapping each index in chars to its character.'''
        return {index: character for index, character in enumerate(chars)}

    def encode_text_as_tensor(self):
        '''Encode self.text as integers and replace it with a long tensor.

        Calling this again after the text has been encoded is a no-op, so
        the notebook cell that calls it can safely be re-run.
        '''
        if isinstance(self.text, torch.Tensor):
            return
        self.text = self.encode_new_text_as_tensor(self.text)

    def encode_new_text_as_tensor(self, to_encode):
        '''Encode a new text with the vocabulary derived from the training
        text and return it as a 1-D long tensor.

        Raises KeyError if to_encode contains a character that is not in
        the vocabulary.
        '''
        codes = [self.str_to_int[character] for character in to_encode]
        return torch.tensor(codes, dtype=torch.long)

    def decode(self, to_decode):
        '''Decode an iterable of integers to a string, using the vocabulary
        attached to the object: self.int_to_str.'''
        return ''.join(self.int_to_str[code] for code in to_decode)

    def __str__(self):
        return f"{self.vocab_size}"

    def make_train_val_test(self, fraction_train, fraction_val, fraction_test):
        '''Split self.text into contiguous train / validation / test slices.

        There is no shuffling, so this assumes there is no bias in how the
        content is distributed within the text. Each slice length is the
        fraction of len(self.text) truncated to an integer. The results are
        stored in self.train_data, self.val_data and self.test_data.
        '''
        # Compare with a tolerance: sums of decimal fractions are not exact
        # in floating point, so "!= 1" can warn for perfectly valid input.
        total = fraction_train + fraction_val + fraction_test
        if abs(total - 1.0) > 1e-9:
            print("Warning, fractions of train, test and validation do "
                  "not add to one.")
        length = len(self.text)
        n = int(fraction_train * length)
        nv = int(fraction_val * length)
        nt = int(fraction_test * length)
        self.train_data = self.text[:n]
        self.val_data = self.text[n:n + nv]
        self.test_data = self.text[n + nv:n + nv + nt]

    def get_batch(self, batch_size, block_size, train_test_validation):
        """Randomly pick blocks from the train / test / validation data.

        Returns (context, target), both of shape (batch_size, block_size);
        target is context shifted one position to the right in the data.

        Raises ValueError for an unknown split name, or when the chosen
        split is not longer than block_size.
        """
        if train_test_validation == "train":
            data = self.train_data
        elif train_test_validation == "test":
            data = self.test_data
        elif train_test_validation == "validation":
            data = self.val_data
        else:
            raise ValueError(
                "Specify data set as 'train', 'test', or 'validation'.")

        # The target block ends one element past the context block, so the
        # split needs at least block_size + 1 elements.
        if len(data) <= block_size:
            raise ValueError(
                "Data must be longer than the specified block size.")

        starts = torch.randint(len(data) - block_size, (batch_size,))
        context_batch = torch.stack(
            [data[i:i + block_size] for i in starts])
        to_predict_batch = torch.stack(
            [data[i + 1:i + block_size + 1] for i in starts])
        return context_batch, to_predict_batch

@torch.no_grad()
def estimate_loss(number_eval_batches,
                  language_model, training_text_object, block_size,
                  batch_size=1):
  '''Average the loss over random batches of the train and validation sets.

  number_eval_batches  -- how many batches to draw per split.
  language_model       -- module called as model(X, Y) -> (logits, loss).
  training_text_object -- object providing
                          get_batch(batch_size, block_size, split).
  block_size           -- context length of every sampled sequence.
  batch_size           -- sequences per evaluation batch (default 1).

  Returns a dictionary with keys 'train' and 'validation', each holding the
  mean loss as a 0-dim tensor.

  The model is switched to eval mode while measuring and afterwards put
  back into whichever mode it was in on entry, even if a batch raises.
  '''
  was_training = language_model.training
  language_model.eval()
  out = {}
  try:
    for split in ('train', 'validation'):
      losses = torch.zeros(number_eval_batches)
      for k in range(number_eval_batches):
        X, Y = training_text_object.get_batch(batch_size, block_size, split)
        _, loss = language_model(X, Y)
        losses[k] = loss.item()
      out[split] = losses.mean()
  finally:
    language_model.train(was_training)
  return out

In [5]:
# LSTM  
#
class LanguageModel_LSTM(nn.Module):
  '''Character-level language model: embedding -> LSTM -> two-layer MLP.

  Logits for the next character are produced from an LSTM run over the
  context (length block_size) of previous characters. generate() turns the
  logits of the last position into a probability distribution and samples
  the next character from it.

  Training learns the values in the embedding table, the weights of the
  LSTM and the weights of the fully connected layers prior to the softmax.
  '''

  def __init__(self, vocab_size, hidden_size, num_layers, verbose=False):
    '''vocab_size  -- number of distinct tokens; also the embedding width.
    hidden_size -- width of the LSTM hidden state and of the MLP hidden layer.
    num_layers  -- number of stacked LSTM layers.
    verbose     -- when true, forward()/generate() print their tensors.
    '''
    super().__init__()
    self.verbose = verbose
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    # Scale the initial embeddings down by a factor of 100. The intent is a
    # near-uniform initial prediction: maximum initial entropy should give
    # the lowest expected cross entropy, since an initial guess is unlikely
    # to be better than equal uncertainty.
    with torch.no_grad():
      self.token_embedding_table.weight.mul_(0.01)
    self.input_size = vocab_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    # (h0, c0) are not passed in forward(), so they default to zero.
    self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers,
                        batch_first=True)
    self.fully_connected = nn.Sequential(
        nn.Linear(self.hidden_size, self.hidden_size),
        nn.ReLU(),
        nn.Linear(self.hidden_size, vocab_size),
    )

  def set_verbose(self, verbose):
    '''Turn the diagnostic printing in forward()/generate() on or off.'''
    self.verbose = verbose

  def forward(self, idx, targets=None):
    '''Compute next-token logits and, when targets are given, the loss.

    idx     -- (B, T) tensor of token indices (the context).
    targets -- optional (B, T) tensor of token indices to predict.

    B is the batch size, T the block size (time) and C the vocab size.

    Returns (logits, loss). Without targets, logits has shape (B, T, C) and
    loss is None. With targets, logits is flattened to (B*T, C) and loss is
    the cross entropy against the flattened targets.
    '''
    embedded = self.token_embedding_table(idx)  # (B, T, C)
    # With batch_first=True the LSTM output is (B, T, hidden_size).
    out, _ = self.lstm(embedded)
    logits = self.fully_connected(out)  # (B, T, C)

    if targets is None:
      return logits, None

    B, T, C = logits.shape
    if self.verbose: print(f"Targets: \n {targets} \n; logits: \n{logits} ")
    # F.cross_entropy expects (N, C) logits and (N,) class labels, so fold
    # the batch and time dimensions together. reshape() also accepts
    # non-contiguous targets, which view() would reject.
    logits = logits.reshape(B * T, C)
    targets = targets.reshape(B * T)
    if self.verbose: print(f"Transformed view: Targets: \n {targets} \n; logits:\n{logits}\n")
    if self.verbose:
      # Show what sampling from the current distribution would produce.
      probs = F.softmax(logits, dim=1)
      idx_next = torch.multinomial(probs, num_samples=1)
      print(f"Generative outcomes during training:\n {idx_next}")

    # cross_entropy includes a softmax transformation.
    loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    '''Predict the next character/token from the learnt distribution, adding
    it to the current context, idx, until max_new_tokens have been added.

    idx -- (B, T) tensor of token indices.
    Returns a (B, T + max_new_tokens) tensor of token indices.
    '''
    # Sampling needs no gradients; skipping autograd avoids building a graph
    # over the whole (growing) context at every step.
    with torch.no_grad():
      for _ in range(max_new_tokens):
        # Get the predictions; this calls forward() without targets.
        logits, _ = self(idx)

        # Keep only the last position of each sequence in the batch.
        logits = logits[:, -1, :]  # (B, C)

        # Generate probability distribution
        probs = F.softmax(logits, dim=-1)  # (B, C)
        if self.verbose:
          print(f'logits {logits}, {logits.shape} \n probs {probs}, {probs.shape}')

        # Sample the vocabulary according to the distribution, probs.
        idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
        # Append sample to sequence
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [6]:
# Toy corpus: the alphabet in space-separated groups, repeated 100 times.
# Wrap it in a `text` object, encode it in place as a tensor of integer
# codes, and split it 80/10/10 into train/validation/test.
string = "ABCD EFGH IJKL MNOP QRSTU VWXYZ " * 100
AB_string_3 = text(text_name=string, is_file=False)
AB_string_3.encode_text_as_tensor()
AB_string_3.make_train_val_test(
    fraction_train=0.8, fraction_val=0.1, fraction_test=0.1)
print(AB_string_3.train_data)
print(AB_string_3.vocab_size)

tensor([ 1,  2,  3,  ..., 25, 26,  0])
27


  self.text = torch.tensor(encode(self.text), dtype = torch.long)


In [7]:
# Model for the alphabet toy problem: one LSTM layer whose hidden width
# equals the vocabulary size.
hidden_size = AB_string_3.vocab_size
input_size = AB_string_3.vocab_size
num_layers = 1
m3 = LanguageModel_LSTM(input_size, hidden_size, num_layers, verbose=False)

# Small batches and short blocks keep the tensors easy to inspect by eye
# when checking that the network does what it is expected to do.
batch_size = 2
block_size = 4
number_of_evaluation_batches = 10
evaluation_interval = 500
learning_rate = 1e-3

# Training loop.
optimizer = torch.optim.AdamW(m3.parameters(), lr=learning_rate)
for step in range(4000):
  # Draw a fresh random batch and take one optimisation step on it.
  xb, yb = AB_string_3.get_batch(batch_size, block_size, 'train')
  logits, loss = m3(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  # An equally uncertain distribution over the 27-character vocabulary gives
  # each next character probability 1/27, i.e. a loss of -ln(1/27) ~= 3.30,
  # which is what the loss at step 0 should be close to.
  if step % evaluation_interval != 0:
    continue

  # Periodic report: the loss of this batch, then losses averaged over
  # several random batches from the train and validation sets.
  print(f"current iteration loss, at step {step}: {loss.item()}")
  losses = estimate_loss(number_of_evaluation_batches, m3, AB_string_3, block_size)
  print(f"step {step}: \n "
        f"           loss averaged over {number_of_evaluation_batches} batches:"
        f"           \n training loss: {losses['train']:.4f}, "
        f"          \n validation loss {losses['validation']:.4f}\n\n")

# Final report once training has finished.
losses = estimate_loss(number_of_evaluation_batches, m3, AB_string_3, block_size)
print(f"final loss: \n "
      f"        loss averaged over {number_of_evaluation_batches} batches:"
      f"        \n training loss: {losses['train']:.4f}, "
      f"        \n validation loss {losses['validation']:.4f}\n\n")

current iteration loss, at step 0: 3.2831077575683594
step 0: 
            loss averaged over 10 batches:           
 training loss: 3.3302,           
 validation loss 3.3374


current iteration loss, at step 500: 0.893225908279419
step 500: 
            loss averaged over 10 batches:           
 training loss: 1.0081,           
 validation loss 1.1495


current iteration loss, at step 1000: 0.36434099078178406
step 1000: 
            loss averaged over 10 batches:           
 training loss: 0.1959,           
 validation loss 0.2853


current iteration loss, at step 1500: 0.036115728318691254
step 1500: 
            loss averaged over 10 batches:           
 training loss: 0.2230,           
 validation loss 0.0806


current iteration loss, at step 2000: 0.24428200721740723
step 2000: 
            loss averaged over 10 batches:           
 training loss: 0.0996,           
 validation loss 0.0669


current iteration loss, at step 2500: 0.010101783089339733
step 2500: 
            lo

In [8]:
# Seed generation with a single space, encoded with the training vocabulary.
context = AB_string_3.encode_new_text_as_tensor(' ')
print(context)
# Add a leading batch dimension, since the model works on batches.
context = context.unsqueeze(0)
print(context)
m3.set_verbose(False)
# Sample 25 further characters, then decode the first (only) sequence.
new_string = m3.generate(context, max_new_tokens=25)
print(new_string)
generated_codes = new_string[0].tolist()
print(AB_string_3.decode(generated_codes))

tensor([0])
tensor([[0]])
tensor([[ 0, 13, 14, 15, 16,  0, 17, 18, 19, 20, 21,  0, 22, 23, 24, 25, 26,  0,
          1,  2,  3,  4,  0,  5,  6,  7]])
 MNOP QRSTU VWXYZ ABCD EFG


In [9]:
# Tiny Shakespeare: read the corpus from input.txt, report its vocabulary,
# encode it in place as a tensor and split it 80/10/10.
print("shakespeare")
shakespeare = text(text_name="input.txt", is_file=True)
print(f"chars, {shakespeare.chars}")
print(f"vocab size: {shakespeare.vocab_size}")
shakespeare.encode_text_as_tensor()
shakespeare.make_train_val_test(
    fraction_train=0.8, fraction_val=0.1, fraction_test=0.1)


shakespeare
reading text from file
chars, ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
vocab size: 65


In [10]:
# Larger model for tiny Shakespeare: the hidden width is twice the
# vocabulary size and three LSTM layers are stacked.
hidden_size = shakespeare.vocab_size * 2
input_size = shakespeare.vocab_size
num_layers = 3
shakespeare_predictor = LanguageModel_LSTM(
    input_size, hidden_size, num_layers, verbose=False)

# Bigger batches and a much longer context than in the alphabet test case.
batch_size = 20
block_size = 100
number_of_evaluation_batches = 20
evaluation_interval = 500
learning_rate = 1e-3

# Training loop.
optimizer = torch.optim.AdamW(
    shakespeare_predictor.parameters(), lr=learning_rate)
for step in range(4000):
  # Draw a fresh random batch and take one optimisation step on it.
  xb, yb = shakespeare.get_batch(batch_size, block_size, 'train')
  logits, loss = shakespeare_predictor(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  if step % evaluation_interval != 0:
    continue

  # Periodic report: the loss of this batch, then losses averaged over
  # several random batches from the train and validation sets.
  print(f"current iteration loss, at step {step}: {loss.item()}")
  losses = estimate_loss(number_of_evaluation_batches,
                         shakespeare_predictor, shakespeare, block_size)
  print(f"step {step}: \n "
        f"           loss averaged over {number_of_evaluation_batches} batches:"
        f"           \n training loss: {losses['train']:.4f}, "
        f"          \n validation loss {losses['validation']:.4f}\n\n")

# Final report once training has finished.
losses = estimate_loss(number_of_evaluation_batches,
                       shakespeare_predictor, shakespeare, block_size)
print(f"final loss: \n "
      f"        loss averaged over {number_of_evaluation_batches} batches:"
      f"        \n training loss: {losses['train']:.4f}, "
      f"        \n validation loss {losses['validation']:.4f}\n\n")

current iteration loss, at step 0: 4.202329158782959
step 0: 
            loss averaged over 20 batches:           
 training loss: 4.1947,           
 validation loss 4.1932


current iteration loss, at step 500: 2.594022512435913
step 500: 
            loss averaged over 20 batches:           
 training loss: 2.5751,           
 validation loss 2.5946


current iteration loss, at step 1000: 2.3799805641174316
step 1000: 
            loss averaged over 20 batches:           
 training loss: 2.3986,           
 validation loss 2.4757


current iteration loss, at step 1500: 2.296194553375244
step 1500: 
            loss averaged over 20 batches:           
 training loss: 2.2671,           
 validation loss 2.2999


current iteration loss, at step 2000: 2.183516263961792
step 2000: 
            loss averaged over 20 batches:           
 training loss: 2.1854,           
 validation loss 2.2326


current iteration loss, at step 2500: 2.0272293090820312
step 2500: 
            loss averag

In [11]:

# Continue training the same model for another 10 000 steps; the loss is
# still coming down slowly. batch_size, block_size and the evaluation
# settings are reused from the previous training cell. A new optimizer
# object is created here.
learning_rate = 1e-3

# Training loop.
optimizer = torch.optim.AdamW(
    shakespeare_predictor.parameters(), lr=learning_rate)
for step in range(10000):
  # Draw a fresh random batch and take one optimisation step on it.
  xb, yb = shakespeare.get_batch(batch_size, block_size, 'train')
  logits, loss = shakespeare_predictor(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  if step % evaluation_interval != 0:
    continue

  # Periodic report: the loss of this batch, then losses averaged over
  # several random batches from the train and validation sets.
  print(f"current iteration loss, at step {step}: {loss.item()}")
  losses = estimate_loss(number_of_evaluation_batches,
                         shakespeare_predictor, shakespeare, block_size)
  print(f"step {step}: \n "
        f"           loss averaged over {number_of_evaluation_batches} batches:"
        f"           \n training loss: {losses['train']:.4f}, "
        f"          \n validation loss {losses['validation']:.4f}\n\n")

# Final report once this round of training has finished.
losses = estimate_loss(number_of_evaluation_batches,
                       shakespeare_predictor, shakespeare, block_size)
print(f"final loss: \n "
      f"        loss averaged over {number_of_evaluation_batches} batches:"
      f"        \n training loss: {losses['train']:.4f}, "
      f"        \n validation loss {losses['validation']:.4f}\n\n")

current iteration loss, at step 0: 1.9088289737701416
step 0: 
            loss averaged over 20 batches:           
 training loss: 2.1291,           
 validation loss 2.2228


current iteration loss, at step 500: 1.8210737705230713
step 500: 
            loss averaged over 20 batches:           
 training loss: 1.8910,           
 validation loss 1.9360


current iteration loss, at step 1000: 1.863354206085205
step 1000: 
            loss averaged over 20 batches:           
 training loss: 1.8833,           
 validation loss 1.9177


current iteration loss, at step 1500: 1.8386300802230835
step 1500: 
            loss averaged over 20 batches:           
 training loss: 1.8516,           
 validation loss 1.9047


current iteration loss, at step 2000: 1.7058151960372925
step 2000: 
            loss averaged over 20 batches:           
 training loss: 1.7743,           
 validation loss 1.7944


current iteration loss, at step 2500: 1.808297872543335
step 2500: 
            loss aver

In [12]:
# The loss still seems to be decreasing, albeit very slowly. Sample some
# text from the model now, then train some more.
# Seed generation with a single space, encoded with the corpus vocabulary.
context = shakespeare.encode_new_text_as_tensor(' ')
print(context)
# Add a leading batch dimension, since the model works on batches.
context = context.unsqueeze(0)
print(context)
shakespeare_predictor.set_verbose(False)
# Sample 100 further characters, then decode the first (only) sequence.
new_string = shakespeare_predictor.generate(context, max_new_tokens=100)
print(new_string)
generated_codes = new_string[0].tolist()
print(shakespeare.decode(generated_codes))

tensor([1])
tensor([[1]])
tensor([[ 1, 51, 63,  1, 54, 56, 43, 43, 58,  8,  1, 13, 46, 61, 53, 56, 42,  2,
          1, 44, 56, 59, 50, 57, 43,  8,  0,  0, 15, 27, 30, 21, 27, 24, 13, 26,
         33, 31, 10,  0, 14, 43, 41, 56, 47, 60, 43,  1, 39, 52, 42,  1, 51, 39,
         52, 63,  1, 61, 47, 58, 46,  1, 40, 43, 53, 52,  6,  0, 26, 53,  6,  1,
         47, 58,  1, 57, 46, 39, 50, 50,  1, 47, 58, 51, 43, 52, 43, 57, 57,  8,
          0,  0, 29, 33, 17, 17, 26,  1, 17, 24, 21]])
 my preet. Ahword! frulse.

CORIOLANUS:
Becrive and many with beon,
No, it shall itmeness.

QUEEN ELI


In [13]:
# Continue training the same model for another 5000 steps; the loss is
# still coming down slowly. batch_size, block_size and the evaluation
# settings are reused from the earlier training cell. A new optimizer
# object is created here.
learning_rate = 1e-3

# Training loop.
optimizer = torch.optim.AdamW(
    shakespeare_predictor.parameters(), lr=learning_rate)
for step in range(5000):
  # Draw a fresh random batch and take one optimisation step on it.
  xb, yb = shakespeare.get_batch(batch_size, block_size, 'train')
  logits, loss = shakespeare_predictor(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  if step % evaluation_interval != 0:
    continue

  # Periodic report: the loss of this batch, then losses averaged over
  # several random batches from the train and validation sets.
  print(f"current iteration loss, at step {step}: {loss.item()}")
  losses = estimate_loss(number_of_evaluation_batches,
                         shakespeare_predictor, shakespeare, block_size)
  print(f"step {step}: \n "
        f"           loss averaged over {number_of_evaluation_batches} batches:"
        f"           \n training loss: {losses['train']:.4f}, "
        f"          \n validation loss {losses['validation']:.4f}\n\n")

# Final report once this round of training has finished.
losses = estimate_loss(number_of_evaluation_batches,
                       shakespeare_predictor, shakespeare, block_size)
print(f"final loss: \n "
      f"        loss averaged over {number_of_evaluation_batches} batches:"
      f"        \n training loss: {losses['train']:.4f}, "
      f"        \n validation loss {losses['validation']:.4f}\n\n")

current iteration loss, at step 0: 1.579905390739441
step 0: 
            loss averaged over 20 batches:           
 training loss: 1.7015,           
 validation loss 1.8380


current iteration loss, at step 500: 1.4419159889221191
step 500: 
            loss averaged over 20 batches:           
 training loss: 1.4974,           
 validation loss 1.6658


current iteration loss, at step 1000: 1.5601158142089844
step 1000: 
            loss averaged over 20 batches:           
 training loss: 1.4682,           
 validation loss 1.5732


current iteration loss, at step 1500: 1.5540715456008911
step 1500: 
            loss averaged over 20 batches:           
 training loss: 1.4807,           
 validation loss 1.6578


current iteration loss, at step 2000: 1.5376372337341309
step 2000: 
            loss averaged over 20 batches:           
 training loss: 1.5385,           
 validation loss 1.6355


current iteration loss, at step 2500: 1.5004305839538574
step 2500: 
            loss ave

In [14]:
# The loss still seems to be decreasing, albeit very slowly. Sample some
# more text from the model to see how it reads after the extra training.
# Seed generation with a single space, encoded with the corpus vocabulary.
context = shakespeare.encode_new_text_as_tensor(' ')
print(context)
# Add a leading batch dimension, since the model works on batches.
context = context.unsqueeze(0)
print(context)
shakespeare_predictor.set_verbose(False)
# Sample 150 further characters, then decode the first (only) sequence.
new_string = shakespeare_predictor.generate(context, max_new_tokens=150)
print(new_string)
generated_codes = new_string[0].tolist()
print(shakespeare.decode(generated_codes))

tensor([1])
tensor([[1]])
tensor([[ 1, 46, 43, 50, 50,  6,  1, 50, 43, 61, 57,  6,  1, 39, 57, 41, 43, 56,
         58, 50, 43, 11,  1, 21,  1, 39, 51,  1, 58, 46, 47, 57,  1, 42, 43, 39,
         44, 10,  0, 37, 53, 59, 56,  1, 53, 58, 46, 43, 56,  1, 61, 46, 47, 50,
         43,  1, 53, 52, 43,  1, 58, 46, 39, 58,  1, 56, 59, 58, 41, 46, 43, 56,
         52, 58,  1, 53, 52,  1, 51, 43, 10,  0, 21,  1, 57, 46, 39, 50, 50,  1,
         46, 39, 60, 43,  1, 58, 39, 49, 43,  1, 53, 44,  1, 30, 47, 41, 46, 51,
         53, 52, 42,  1, 47, 57,  6,  0, 33, 52, 58, 43, 47, 45, 52, 39, 50, 57,
          1, 47, 52,  1, 51, 63, 57, 43, 50, 44,  6,  1, 58, 46, 39, 58,  1, 58,
         47, 50, 50,  1, 56, 39, 52]])
 hell, lews, ascertle; I am this deaf:
Your other while one that rutchernt on me:
I shall have take of Richmond is,
Unteignals in myself, that till ran
