In [2]:
# Herein are cells that 
# 1. Create a text class that has methods to determine the vocabulary
# within the text, encode/decode the text, split the text content into
# validation, training and test sets.
# 
# 2. LSTM language model for training and generating text.
#
# 3. Reading in of a simple test case
#  Training and validation loop for simple test case.
#
# 4. Tiny Shakespeare training and generation. 
# 

# Refactoring - there is a logic in moving the vocab/encoding/decoding 
# methods from the text class into the language model, since the language
# model has to deal with with an appropriately encoded/decoded 
# set of data - so saving the weights of the language model is useless
# without knowing the encodings! On the otherhand ... that's to 
# put two different types of things in the same object ... so leave as
# is for the moment. 

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class text:
    def __init__(self, text_name, is_file):
        
        # Retrieve text
        self.text = ''
        self.encoded = ''
        if is_file == True:
            print("reading text from file")
            self.from_file(text_name)
        else:
            self.from_string(text_name)
        
        # Calculate vocab size, i.e. the number of characters; 
        # first get sorted list of unique characters
        self.chars = sorted(list(set(self.text)))
        #self.vocab_size = self.vocab_size(chars)
        self.vocab_size = len(self.chars)
        self.str_to_int = {}
        self.int_to_str = {}
        self.str_to_int = self.make_str_to_int_table(self.chars)
        self.int_to_str = self.make_int_to_str_table(self.chars)
    
    def from_file(self, filename):
        'Read text from file and calculate vocab size'
        self.text = open(filename,'r',encoding='utf-8').read()
    
    def from_string(self, string):
        'Read text from string and calculate vocab size'
        self.text = string
    
    #def calc_vocab_size(self, chars):
    #    'Calculate vocab size'
    #    self.vocab_size = len(chars)
        
    def make_str_to_int_table(self, chars):
        '''Populate dictionary of character to integer mapping'''
        return  {character: int for int, character in enumerate(chars)}
    
    def make_int_to_str_table(self, chars):
        '''Populate dictionary of character to integer mapping'''
        return  {int: character for int, character in enumerate(chars)}
    
    def encode_text_as_tensor(self):
        '''encode the training text as a list of integers 
        and then convert to tensor with which to replace self.text'''
        encode = lambda char:[self.str_to_int[char] for char in self.text]
        self.text = torch.tensor(encode(self.text), dtype = torch.long)
    
    def encode_new_text_as_tensor(self, to_encode):
        '''encode a new text as a list of integers, according to the 
        encoding derived from the training text. Return a tensor'''
        encode = lambda char:[self.str_to_int[char] for char in to_encode]
        return torch.tensor(encode(to_encode), dtype = torch.long)

    def decode(self, to_decode):
        '''decode from a list of integers to a string, using the
        encoding vocabulary attached to the object: self.int_to_str '''  
        decode = lambda l: ''.join([self.int_to_str[i] for i in l])
        return decode(to_decode)
        
    def __str__(self):
        return f"{self.vocab_size}"
    
    def make_train_val_test(self, fraction_train, fraction_val, fraction_test):
        '''simple train /validation sets.  no randomisation 
         of selections, so assuming  no bias in the distribution within the data file'''
        if fraction_train + fraction_test + fraction_val != 1:
            print("Warning, fractions of train, test and validation do \
                  not add to one.")
        n = int(fraction_train*len(self.text))
        nv = int(fraction_val*len(self.text))
        nt = int(fraction_test*len(self.text))
        self.train_data = self.text[:n]
        self.val_data = self.text[n:n+nv]
        self.test_data = self.text[n+nv:n+nv+nt]
    
    def get_batch(self, batch_size, block_size, train_test_validation):
        """Randomly pick data from the training data/test data/validation
        and return as a batch stacked in a torch tensor."""
        if train_test_validation == "train":
            data = self.train_data
        elif train_test_validation == "test":
            data = self.test_data
        elif train_test_validation == "validation":
            data = self.val_data
        else:
            raise \
            ValueError("Specify data set as 'train', 'test', or 'validation'.")

        if len(data) < block_size:
            raise ValueError("Data is smaller than the specified block size.")

        ix = torch.randint(len(data) - block_size, (batch_size,))
        train_context_batch = torch.stack([data[i:i + block_size] for i in ix])
        train_to_predict_batch =\
              torch.stack([data[i + 1: i + block_size + 1] for i in ix])
        return train_context_batch, train_to_predict_batch

@torch.no_grad()
def estimate_loss(number_eval_batches, 
                  language_model, training_text_object, block_size):
  '''Take number_eval_batches from training and validation sets and 
  calculate an average loss for each. Return a dictionary with the
  two losses, with keys train and validation '''
  out = {}
  language_model.eval()
  for split in ['train', 'validation']:
    losses = torch.zeros(number_eval_batches)
    for k in range(number_eval_batches):
      X, Y = training_text_object.get_batch(1, block_size, split)
      logits, loss = language_model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  language_model.train()
  return out

In [3]:
# LSTM  
#
class LanguageModel_LSTM(nn.Module):
  '''logits produced from an LSTM over the context (length block_size) 
   of the previous characters, 
    from which we are trying to predict the current character; 
  generate() uses logits as a multivariate distribution from which 
    to predict next character. 
  
   Training learns the values in the embedding vector, the weights of 
    RNN and the weights of the fully connected layer prior to the
     softmax '''

  def __init__(self, vocab_size, hidden_size, num_layers, verbose=False):
    super().__init__()
    self.verbose=verbose
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    # make all embeddings small so the logits produced give a
    # probability distribution is equal in all direction.
    # maximum initial entropy should give the most likely low cross entropy, 
    # since initial guess is not likely to be better than equal uncertainty.
    with torch.no_grad():
      self.token_embedding_table.weight.data \
      = self.token_embedding_table.weight.data * 0.01
    self.input_size = vocab_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers,\
                       batch_first=True) # (h0, c0) default to zero.
    self.fully_connected = nn.Sequential(
                       nn.Linear(self.hidden_size, self.hidden_size),
                       nn.ReLU(),
                       nn.Linear(self.hidden_size, vocab_size)
                       )

    
  def set_verbose(self, verbose):
    self.verbose = verbose

  def forward(self, idx, targets=None):
    # idx - (index of) the x values, i.e. the context vector
    # idx and targets are both (B, T) tensor of integers (see comment below)
    #h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
    # default h0 for RNN model  is to set everything to zeros
    logits = self.token_embedding_table(idx) # (B, T, C)
    # nn.RNN produces batch_size, sequ, hidden_size,
    out, _ = self.lstm(logits)




    #logits = self.fully_connected(logits)
    logits = self.fully_connected(out)
    # B T C is batch by time by channel
    # batch  is number of the batch
    # time is block size
    # channel is the vocab size 
    if targets == None:
      loss = None
    else:
      B, T, C = logits.shape # need to reshape to be correct shape for cross_entropy
      # use view in pytorch that changes the view of the data passed but not the
      # underlying data. Flatten the first two dimensions of the logits, 
      # to create
      # a batch of size B*T with the channel data,
      # i.e. vocab or probability of each character/class as the second
      # dimension, which is what F.cross_entropy expects.
      # Similarly, targets reduced to 1 dimension of batch data (B*T) 
      # with class lable in each dimension. 
      if self.verbose: print(f"Targets: \n {targets} \n; logits: \n{logits} ")
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      if self.verbose: print(f"Transformed view: Targets: \n {targets} \n; logits:\n{logits}\n")
      # cross_entropy includes a softmax transformation. 
      if self.verbose:
        probs = F.softmax(logits, dim=1)
        idx_next = torch.multinomial(probs, num_samples = 1) 
        print(f"Generative outcomes during training:\n {idx_next}")

      loss = F.cross_entropy(logits, targets)
    return logits, loss
  
  def generate(self, idx, max_new_tokens):
    '''Predict the next character/token from the learnt distribution,adding
    it to the current context, idx, till max_new_tokens have been added.'''
    for _ in range(max_new_tokens):
      #get the predictions
      logits, _ = self(idx) # calls forwards() for this class
      
      # Following line: 
      # h_out from the last character of each sequence in the batch.
      logits = logits[:, -1, :] 
      # logits now has dimension B, C, since only one token.
      
      # Generate probability distribution
      probs = F.softmax(logits, dim=-1) # B, C
      if self.verbose:
          print(f'logits {logits}, {logits.shape} \n probs {probs}, {probs.shape}')
      
      # Sample  vocabulary according to the probability distribution, probs.
      idx_next = torch.multinomial(probs, num_samples = 1) # B by 1 size
      # Append sample to sequence
      idx = torch.cat((idx, idx_next), dim = 1)
    return idx

In [6]:
# Train language model on "ABCD EFGH IJKL MNOP QRSTU VWXYZ " string.
# Define string
# Read string into object
# Encode string and place in a tensor
string = "ABCD EFGH IJKL MNOP QRSTU VWXYZ "
string = string * 100
AB_string_3 = text(string, is_file=False)
AB_string_3.encode_text_as_tensor()
AB_string_3.make_train_val_test(0.8, 0.1, 0.1)
print(AB_string_3.train_data)
print(AB_string_3.vocab_size)

tensor([ 1,  2,  3,  ..., 25, 26,  0])
27


  self.text = torch.tensor(encode(self.text), dtype = torch.long)


In [7]:
hidden_size = AB_string_3.vocab_size
input_size = AB_string_3.vocab_size
num_layers = 1
m3 = LanguageModel_LSTM(input_size, hidden_size, num_layers, verbose=False)

# check code is doing what I expect it to do. 
# Keep batch_size and block_size small to allow easy visualisation of 
# what is going on inside the network, to check that it's doing what I 
# think it's doing.
# Uncommented print statements in code
batch_size=2
block_size =4
number_of_evaluation_batches = 10
evaluation_interval = 500
# high learning rate since simple network
learning_rate = 1e-3
#logits, loss = m(train_context_batch, train_to_predict_batch)
#print(logits.shape) 
#print(loss)

# training loop. 
optimizer = torch.optim.AdamW(m3.parameters(), lr=learning_rate) 
for step in range(4000):
  # get a new batch
  xb, yb = AB_string_3.get_batch(batch_size, block_size, 'train')
  logits, loss = m3(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
   #equally uncertain distribution implies 1/vocab_size = 1/65 probability of each 
   # next character which implies -ln(1/65) = 4.17 would be optimal initial loss
  if step % evaluation_interval == 0: 
    print(f"current iteration loss, at step {step}: {loss.item()}")
  if step % evaluation_interval == 0:
    #logits_val, loss_val = m(xval, yval)
    #print(f"training loss: {loss.item()}, \
    #                                       validation loss {loss_val.item()}")
    losses = estimate_loss(number_of_evaluation_batches, m3, AB_string_3, block_size)
    print(f"step {step}: \n \
           loss averaged over {number_of_evaluation_batches} batches:\
           \n training loss: {losses['train']:.4f}, \
          \n validation loss {losses['validation']:.4f}\n\n")
losses = estimate_loss(number_of_evaluation_batches, m3, AB_string_3, block_size)
print(f"final loss: \n \
        loss averaged over {number_of_evaluation_batches} batches:\
        \n training loss: {losses['train']:.4f}, \
        \n validation loss {losses['validation']:.4f}\n\n")

current iteration loss, at step 0: 3.2831077575683594
step 0: 
            loss averaged over 10 batches:           
 training loss: 3.3302,           
 validation loss 3.3374


current iteration loss, at step 500: 0.893225908279419
step 500: 
            loss averaged over 10 batches:           
 training loss: 1.0081,           
 validation loss 1.1495


current iteration loss, at step 1000: 0.36434099078178406
step 1000: 
            loss averaged over 10 batches:           
 training loss: 0.1959,           
 validation loss 0.2853


current iteration loss, at step 1500: 0.036115728318691254
step 1500: 
            loss averaged over 10 batches:           
 training loss: 0.2230,           
 validation loss 0.0806


current iteration loss, at step 2000: 0.24428200721740723
step 2000: 
            loss averaged over 10 batches:           
 training loss: 0.0996,           
 validation loss 0.0669


current iteration loss, at step 2500: 0.010101783089339733
step 2500: 
            lo

In [8]:
context = ' '
context = AB_string_3.encode_new_text_as_tensor(context)
print(context)
context = torch.unsqueeze(context, dim=0)
print(context)
m3.set_verbose(False)
new_string = m3.generate(context,25)
print(new_string)
print(AB_string_3.decode(new_string[0].tolist()))

tensor([0])
tensor([[0]])
tensor([[ 0, 13, 14, 15, 16,  0, 17, 18, 19, 20, 21,  0, 22, 23, 24, 25, 26,  0,
          1,  2,  3,  4,  0,  5,  6,  7]])
 MNOP QRSTU VWXYZ ABCD EFG


In [5]:
# Next, tiny Shakespeare
print("shakespeare")
shakespeare = text("input.txt", is_file=True)
#shakespeare.from_file("input.txt")
print(f"chars, {shakespeare.chars}")
print(f"vocab size: {shakespeare.vocab_size}")
shakespeare.encode_text_as_tensor()
shakespeare.make_train_val_test(0.8, 0.1, 0.1)


shakespeare
reading text from file
chars, ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
vocab size: 65


  self.text = torch.tensor(encode(self.text), dtype = torch.long)


In [6]:
hidden_size = shakespeare.vocab_size*2
input_size = shakespeare.vocab_size
num_layers = 3
shakespeare_predictor = LanguageModel_LSTM(input_size, \
                                hidden_size, num_layers, verbose=False)

# check code is doing what I expect it to do. 
# Keep batch_size and block_size small to allow easy visualisation of 
# what is going on inside the network, to check that it's doing what I 
# think it's doing.

batch_size=32
block_size =100
number_of_evaluation_batches = 20
evaluation_interval = 500
learning_rate = 1e-3

# training loop. 
optimizer =\
      torch.optim.AdamW(shakespeare_predictor.parameters(), lr=learning_rate) 
for step in range(4000):
  # get a new batch
  xb, yb = shakespeare.get_batch(batch_size, block_size, 'train')
  logits, loss = shakespeare_predictor(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  if step % evaluation_interval == 0: 
    print(f"current iteration loss, at step {step}: {loss.item()}")
  if step % evaluation_interval == 0:
    losses = estimate_loss(number_of_evaluation_batches, \
                           shakespeare_predictor, shakespeare, block_size)
    print(f"step {step}: \n \
           loss averaged over {number_of_evaluation_batches} batches:\
           \n training loss: {losses['train']:.4f}, \
          \n validation loss {losses['validation']:.4f}\n\n")
losses = estimate_loss(number_of_evaluation_batches,\
                        shakespeare_predictor, shakespeare, block_size)
print(f"final loss: \n \
        loss averaged over {number_of_evaluation_batches} batches:\
        \n training loss: {losses['train']:.4f}, \
        \n validation loss {losses['validation']:.4f}\n\n")

current iteration loss, at step 0: 4.160330772399902
step 0: 
            loss averaged over 20 batches:           
 training loss: 4.1506,           
 validation loss 4.1510


current iteration loss, at step 500: 2.5889129638671875
step 500: 
            loss averaged over 20 batches:           
 training loss: 2.5823,           
 validation loss 2.6899


current iteration loss, at step 1000: 2.3099141120910645
step 1000: 
            loss averaged over 20 batches:           
 training loss: 2.3156,           
 validation loss 2.2898


current iteration loss, at step 1500: 2.172563076019287
step 1500: 
            loss averaged over 20 batches:           
 training loss: 2.1323,           
 validation loss 2.1592


current iteration loss, at step 2000: 2.0868782997131348
step 2000: 
            loss averaged over 20 batches:           
 training loss: 1.9817,           
 validation loss 2.0954


current iteration loss, at step 2500: 1.9080678224563599
step 2500: 
            loss aver

In [7]:

# continue with another 10 000 steps - loss is coming down slowly
learning_rate = 1e-3

# training loop. 
optimizer =\
      torch.optim.AdamW(shakespeare_predictor.parameters(), lr=learning_rate) 
for step in range(10000):
  # get a new batch
  xb, yb = shakespeare.get_batch(batch_size, block_size, 'train')
  logits, loss = shakespeare_predictor(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  if step % evaluation_interval == 0: 
    print(f"current iteration loss, at step {step}: {loss.item()}")
  if step % evaluation_interval == 0:
    losses = estimate_loss(number_of_evaluation_batches, \
                           shakespeare_predictor, shakespeare, block_size)
    print(f"step {step}: \n \
           loss averaged over {number_of_evaluation_batches} batches:\
           \n training loss: {losses['train']:.4f}, \
          \n validation loss {losses['validation']:.4f}\n\n")
losses = estimate_loss(number_of_evaluation_batches,\
                        shakespeare_predictor, shakespeare, block_size)
print(f"final loss: \n \
        loss averaged over {number_of_evaluation_batches} batches:\
        \n training loss: {losses['train']:.4f}, \
        \n validation loss {losses['validation']:.4f}\n\n")

current iteration loss, at step 0: 1.766512155532837
step 0: 
            loss averaged over 20 batches:           
 training loss: 1.9968,           
 validation loss 2.1066


current iteration loss, at step 500: 1.767135500907898
step 500: 
            loss averaged over 20 batches:           
 training loss: 1.6889,           
 validation loss 1.7798


current iteration loss, at step 1000: 1.7006516456604004
step 1000: 
            loss averaged over 20 batches:           
 training loss: 1.6787,           
 validation loss 1.7881


current iteration loss, at step 1500: 1.6458576917648315
step 1500: 
            loss averaged over 20 batches:           
 training loss: 1.6490,           
 validation loss 1.7993


current iteration loss, at step 2000: 1.6739484071731567
step 2000: 
            loss averaged over 20 batches:           
 training loss: 1.6966,           
 validation loss 1.7325


current iteration loss, at step 2500: 1.5582492351531982
step 2500: 
            loss aver

In [8]:
# current model seems to be decreasing in loss, albeit very slowly
# try some generative function and then train some more.
context = ' '
context = shakespeare.encode_new_text_as_tensor(context)
print(context)
context = torch.unsqueeze(context, dim=0)
print(context)
shakespeare_predictor.set_verbose(False)
new_string = shakespeare_predictor.generate(context,100)
print(new_string)
print(shakespeare.decode(new_string[0].tolist()))

tensor([1])
tensor([[1]])
tensor([[ 1, 57, 53, 50, 42, 47, 52, 45,  6,  0, 21, 57,  1, 57, 53,  1, 46, 43,
         56, 43,  1, 47, 57,  1, 57, 53,  1, 42, 39, 59, 45, 46, 58, 43, 56,  7,
         42, 39, 63, 43, 42,  1, 61, 47, 58, 46,  1, 63, 53, 59,  0, 21, 52,  1,
         17, 52, 45, 50, 39, 52, 42, 47, 52, 45,  1, 61, 39, 56, 57,  1, 53, 44,
          1, 51, 63,  1, 53, 47, 52,  1, 47, 52, 44, 47, 56, 51,  8,  0,  0, 30,
         21, 15, 20, 13, 30, 16, 10,  0, 25, 63,  1]])
 solding,
Is so here is so daughter-dayed with you
In Englanding wars of my oin infirm.

RICHARD:
My 


In [9]:
# continue with another 10 000 steps - loss is coming down slowly
learning_rate = 1e-3

# training loop. 
optimizer =\
      torch.optim.AdamW(shakespeare_predictor.parameters(), lr=learning_rate) 
for step in range(5000):
  # get a new batch
  xb, yb = shakespeare.get_batch(batch_size, block_size, 'train')
  logits, loss = shakespeare_predictor(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  if step % evaluation_interval == 0: 
    print(f"current iteration loss, at step {step}: {loss.item()}")
  if step % evaluation_interval == 0:
    losses = estimate_loss(number_of_evaluation_batches, \
                           shakespeare_predictor, shakespeare, block_size)
    print(f"step {step}: \n \
           loss averaged over {number_of_evaluation_batches} batches:\
           \n training loss: {losses['train']:.4f}, \
          \n validation loss {losses['validation']:.4f}\n\n")
losses = estimate_loss(number_of_evaluation_batches,\
                        shakespeare_predictor, shakespeare, block_size)
print(f"final loss: \n \
        loss averaged over {number_of_evaluation_batches} batches:\
        \n training loss: {losses['train']:.4f}, \
        \n validation loss {losses['validation']:.4f}\n\n")

current iteration loss, at step 0: 1.3689426183700562
step 0: 
            loss averaged over 20 batches:           
 training loss: 1.5214,           
 validation loss 1.7030


current iteration loss, at step 500: 1.3954135179519653
step 500: 
            loss averaged over 20 batches:           
 training loss: 1.4450,           
 validation loss 1.5425


current iteration loss, at step 1000: 1.396178126335144
step 1000: 
            loss averaged over 20 batches:           
 training loss: 1.3731,           
 validation loss 1.5718


current iteration loss, at step 1500: 1.421913743019104
step 1500: 
            loss averaged over 20 batches:           
 training loss: 1.4122,           
 validation loss 1.5844


current iteration loss, at step 2000: 1.4302153587341309
step 2000: 
            loss averaged over 20 batches:           
 training loss: 1.3774,           
 validation loss 1.6352


current iteration loss, at step 2500: 1.3983981609344482
step 2500: 
            loss aver

In [10]:
# current model seems to be still decreasing in loss, albeit very slowly
context = ' '
context = shakespeare.encode_new_text_as_tensor(context)
print(context)
context = torch.unsqueeze(context, dim=0)
print(context)
shakespeare_predictor.set_verbose(False)
new_string = shakespeare_predictor.generate(context,150)
print(new_string)
print(shakespeare.decode(new_string[0].tolist()))

tensor([1])
tensor([[1]])
tensor([[ 1, 57, 54, 43, 39, 49,  1, 39, 50, 50, 53, 61,  8,  0,  0, 30, 13, 32,
         15, 24, 13, 33, 31, 10,  0, 21, 44,  1, 58, 46, 43, 52,  1, 46, 43,  1,
         41, 39, 52, 52, 53, 58,  1, 45, 43, 52, 58, 50, 43, 51, 64, 53, 56, 42,
         10,  0, 18, 53, 56,  1, 58, 46, 59, 57,  1, 44, 53, 59, 52, 42,  1, 46,
         47, 57,  1, 46, 39, 52, 42,  1, 53, 44,  1, 57, 59, 41, 49, 43, 57,  1,
         63, 53, 59, 56,  1, 61, 39, 63, 57,  5, 58,  0, 35, 47, 58, 46,  1, 58,
         46, 43,  1, 53, 56, 54, 50, 43, 57, 57, 47, 53, 52,  1, 57, 53, 52, 42,
          1, 44, 53, 56,  1, 53, 59, 56,  1, 57, 47, 45, 46,  8,  0, 21, 44,  1,
         21,  1, 42, 47, 42,  1, 52]])
 speak allow.

RATCLAUS:
If then he cannot gentlemzord:
For thus found his hand of suckes your ways't
With the orplession sond for our sigh.
If I did n


In [12]:
shakespeare_predictor.state_dict()

OrderedDict([('token_embedding_table.weight',
              tensor([[-0.3519,  0.8586,  0.2595,  ..., -0.2710,  0.5109,  0.2001],
                      [-0.0978,  0.1085,  0.2313,  ..., -0.4648,  0.3872,  0.2498],
                      [-0.3034,  0.6094,  0.0309,  ...,  0.1928,  0.4655,  0.5296],
                      ...,
                      [ 0.2113, -0.2238, -0.2096,  ...,  0.0266, -0.2321, -0.0184],
                      [ 0.7435, -0.1425,  0.1972,  ..., -0.1706, -0.1924, -0.4187],
                      [ 0.2101, -1.1720,  0.0276,  ..., -0.0334, -0.3433,  0.5295]])),
             ('lstm.weight_ih_l0',
              tensor([[ 1.2527, -0.7727,  0.0292,  ..., -0.2207,  0.1949,  0.1861],
                      [ 0.7808, -0.6094, -0.2140,  ...,  0.2757, -0.4334, -0.4543],
                      [ 0.2404,  0.2741, -0.3262,  ...,  0.7664,  0.0780,  0.1539],
                      ...,
                      [-0.4716,  0.1124,  0.2938,  ..., -0.0517, -0.2308,  0.3878],
                      

In [13]:
from pathlib import Path

# 1. Create models directory 
MODEL_PATH = Path("models")
MODEL_PATH.mkdir(parents=True, exist_ok=True)

# 2. Create model save path 
MODEL_NAME = "shakespeare_LSTM.pt"
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

# 3. Save the model state dict 
print(f"MODEL_SAVE_PATH: {MODEL_SAVE_PATH}")
torch.save(obj=shakespeare_predictor.state_dict(), # only saving the parameters
           f=MODEL_SAVE_PATH)

MODEL_SAVE_PATH: models\shakespeare_LSTM.pt
