In [3]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Get the data

In [5]:
with open('../../Data/shakespeare.txt','r',encoding='utf8') as f:
    text = f.read()

text[:100]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose mi"

In [7]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [8]:
len(text)

5445609

### Encode entire text

In [9]:
all_characters = set(text)
all_characters

{'\n',
 ' ',
 '!',
 '"',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '}'}

In [10]:
# num ----> Letter
decoder = dict(enumerate(all_characters))
decoder

{0: ':',
 1: 'm',
 2: 'Y',
 3: '"',
 4: '\n',
 5: 'k',
 6: 'i',
 7: '>',
 8: '<',
 9: ' ',
 10: 'g',
 11: '9',
 12: '-',
 13: '}',
 14: 'C',
 15: 'V',
 16: 'P',
 17: 'S',
 18: ';',
 19: 'D',
 20: 'B',
 21: 'z',
 22: '8',
 23: '?',
 24: 'y',
 25: 'l',
 26: 'I',
 27: 'F',
 28: '7',
 29: 'L',
 30: 'n',
 31: '5',
 32: 'x',
 33: 'e',
 34: 'p',
 35: 'X',
 36: '0',
 37: 'R',
 38: 't',
 39: 'E',
 40: 'A',
 41: 'T',
 42: 'H',
 43: ',',
 44: ']',
 45: 'W',
 46: 'b',
 47: '|',
 48: 'a',
 49: '(',
 50: '_',
 51: '3',
 52: 'O',
 53: 'U',
 54: 'J',
 55: 'q',
 56: 'o',
 57: 'Q',
 58: 'j',
 59: 'd',
 60: 'N',
 61: 'h',
 62: 'G',
 63: 'Z',
 64: 'M',
 65: "'",
 66: 's',
 67: 'f',
 68: '2',
 69: '&',
 70: 'c',
 71: ')',
 72: '1',
 73: 'K',
 74: '`',
 75: 'u',
 76: '6',
 77: 'w',
 78: '.',
 79: 'r',
 80: '!',
 81: '[',
 82: 'v',
 83: '4'}

In [11]:
# Letter ----> num
encoder = {char:ind for ind, char in decoder.items()}
encoder

{':': 0,
 'm': 1,
 'Y': 2,
 '"': 3,
 '\n': 4,
 'k': 5,
 'i': 6,
 '>': 7,
 '<': 8,
 ' ': 9,
 'g': 10,
 '9': 11,
 '-': 12,
 '}': 13,
 'C': 14,
 'V': 15,
 'P': 16,
 'S': 17,
 ';': 18,
 'D': 19,
 'B': 20,
 'z': 21,
 '8': 22,
 '?': 23,
 'y': 24,
 'l': 25,
 'I': 26,
 'F': 27,
 '7': 28,
 'L': 29,
 'n': 30,
 '5': 31,
 'x': 32,
 'e': 33,
 'p': 34,
 'X': 35,
 '0': 36,
 'R': 37,
 't': 38,
 'E': 39,
 'A': 40,
 'T': 41,
 'H': 42,
 ',': 43,
 ']': 44,
 'W': 45,
 'b': 46,
 '|': 47,
 'a': 48,
 '(': 49,
 '_': 50,
 '3': 51,
 'O': 52,
 'U': 53,
 'J': 54,
 'q': 55,
 'o': 56,
 'Q': 57,
 'j': 58,
 'd': 59,
 'N': 60,
 'h': 61,
 'G': 62,
 'Z': 63,
 'M': 64,
 "'": 65,
 's': 66,
 'f': 67,
 '2': 68,
 '&': 69,
 'c': 70,
 ')': 71,
 '1': 72,
 'K': 73,
 '`': 74,
 'u': 75,
 '6': 76,
 'w': 77,
 '.': 78,
 'r': 79,
 '!': 80,
 '[': 81,
 'v': 82,
 '4': 83}

In [13]:
encoded_text = np.array([encoder[char] for char in text])
encoded_text[:100]

array([ 4,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9, 72,  4,  9,  9, 27, 79, 56,  1,  9, 67, 48,  6,
       79, 33, 66, 38,  9, 70, 79, 33, 48, 38, 75, 79, 33, 66,  9, 77, 33,
        9, 59, 33, 66,  6, 79, 33,  9,  6, 30, 70, 79, 33, 48, 66, 33, 43,
        4,  9,  9, 41, 61, 48, 38,  9, 38, 61, 33, 79, 33, 46, 24,  9, 46,
       33, 48, 75, 38, 24, 65, 66,  9, 79, 56, 66, 33,  9,  1,  6])

## One Hot Encoding

As previously discussed, we need to one-hot encode our data inorder for it to work with the network structure. Make sure to review numpy if any of these operations confuse you!

In [25]:
def one_hot_encoder(encoded_text, num_uni_chars):
    '''
    encoded_text : batch of encoded text
    
    num_uni_chars = number of unique characters (len(set(text)))
    '''
    #Create a placeholder for 0s
    one_hot = np.zeros((encoded_text.size, num_uni_chars))
    # print(f"The size is {one_hot.shape} and the element is {one_hot}")

    #Convert data type for the later use with the pytorch
    one_hot = one_hot.astype(np.float32)
    # print(one_hot)

    #Using fancy indexing fill in the 1s at the correct index location
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0
    # print(one_hot)

    #Reshape it so it matches the batch shape
    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))
    # print(f"The size is {one_hot.shape} and the element is {one_hot}")
    

    return one_hot


In [24]:
one_hot_encoder(np.array([1,2,0]),3)

The size is (3, 3) and the element is [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
The size is (3, 3) and the element is [[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

--------------
---------------
# Creating Training Batches

We need to create a function that will generate batches of characters along with the next character in the sequence as a label.

-----------------
------------

In [26]:
example_text = np.arange(10)
example_text

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [27]:
example_text.reshape((5,-1))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [38]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    '''
    Generate (using yield) batches for training.
    
    X: Encoded Text of length seq_len
    Y: Encoded Text shifted by one
    
    Example:
    
    X:
    
    [[1 2 3]]
    
    Y:
    
    [[ 2 3 4]]
    
    encoded_text : Complete Encoded Text to make batches from
    batch_size : Number of samples per batch
    seq_len : Length of character sequence
    '''
    # Total number of characters per batch
    # Example : If sample_per_batch is 2 and seq_len is 50 then, then 100
    # characters come out per batch
    char_per_batch = samp_per_batch * seq_len

    # Number of batches availabe to make
    # Use int() to return nearest integer
    num_batches_avail = int(len(encoded_text)/char_per_batch)

    # Cut off end of encoded text that won't fit evenly into a batch
    encoded_text = encoded_text[:num_batches_avail*char_per_batch]
    # print(encoded_text.shape)

    #Reshape text into rows the size of the batch
    encoded_text = encoded_text.reshape((samp_per_batch, -1))
    # print(encoded_text.shape)

    # Go through each row in the array
    for n in range(0, encoded_text.shape[1], seq_len):
        # Grab the feature character
        x = encoded_text[:, n:n+seq_len]

        # y is the target shifted over by 1
        y = np.zeros_like(x)

        try:
            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, n+seq_len]
        # FOR POTENTIAL INDEXING ERROR AT THE END 
        except:
            y[:, :-1] = x[:, 1:]
            y[:, -1] = encoded_text[:, 0]
        yield x,y

### Example of generating a batch

In [36]:
sample_text = encoded_text[:20]
sample_text

array([4, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

In [37]:
batch_gernerator = generate_batches(sample_text, samp_per_batch=2, seq_len=5)
#Grab the first batch
x,y = next(batch_gernerator)
print(f"x : {x} \n y : {y}")

(20,)
(2, 10)
x : [[4 9 9 9 9]
 [9 9 9 9 9]] 
 y : [[9 9 9 9 9]
 [9 9 9 9 9]]


In [39]:
torch.cuda.is_available()

False

# Creating the LSTM Model

**Note! We will have options for GPU users and CPU users. CPU will take MUCH LONGER to train and you may encounter RAM issues depending on your hardware. If that is the case, consider using cloud services like AWS, GCP, or Azure. Note, these may cost you money to use!**

In [61]:
class CharModel(nn.Module):
    def __init__(self, all_chars, num_hidden=256, num_layers=4,drop_prob=0.5,use_gpu=False ):
        super().__init__()
        self.all_chars = all_chars
        self.num_hiddn = num_hidden
        self.num_layers = num_layers
        self.drop_prob = drop_prob
        self.use_gpu = use_gpu

        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char:ind for ind,char in decoder.items()}

        self.lstm = nn.LSTM(input_size=len(self.all_chars), hidden_size=num_hidden, num_layers=num_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))

    def forward(self, x, hidden):
        lstm_output, hidden = self.lstm(x, hidden)

        drop_output = self.dropout(lstm_output)

        drop_output = drop_output.contiguous().view(-1, self.num_hiddn)

        final_out = self.fc_linear(drop_output)

        return final_out, hidden
    
    def hidden_state(self, batch_size):
        '''
        Used as separate method to account for both GPU and CPU users.

        '''
        if self.use_gpu:
            hidden = (torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda(),
                     torch.zeros(self.num_layers,batch_size,self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers, batch_size, self.num_hiddn),
                      torch.zeros(self.num_layers, batch_size, self.num_hiddn))
        return hidden
            

## Instance of the Model

In [62]:
model = CharModel(all_chars=all_characters,
                  num_hidden=512, num_layers=3, drop_prob=0.5, use_gpu=False)
model

CharModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

In [63]:
total_param = []
for p in model.parameters():
    total_param.append(p.numel())
sum(total_param)

5470292

In [64]:
len(encoded_text)

5445609

### Optimizer and loss

In [65]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

### Training data and validation data

In [66]:
# percentage of data to be used for training
train_percent = 0.1
print(len(encoded_text))
int(len(encoded_text)*train_percent)

5445609


544560

In [67]:
train_ind = int(len(encoded_text)*train_percent)
train_data = encoded_text[:train_ind]
val_data = encoded_text[train_ind:]

# Training the Network

## Variables

Feel free to play around with these values!

In [68]:
## VARIABLES

# Epochs to train for
epochs = 50
# Batch size
batch_size = 128
# Length of sequence
seq_len = 100
# For printing report purpose always start at 0
tracker = 0
#Number of character in text
num_char = max(encoded_text)+1


In [69]:
train_data.shape

(544560,)

In [70]:
batch_size

128

In [71]:
seq_len

100

In [72]:
num_char

84

In [74]:
#Set model to train 
model.train()

#Check to see if model is using GPU
if model.use_gpu:
    model.cuda()

for i in range(epochs):
    hidden = model.hidden_state(batch_size=batch_size)
    
    for x, y in generate_batches(train_data, batch_size, seq_len):
        tracker += 1
        # One hot encode incoming data
        x = one_hot_encoder(x, num_char)

        # Convert numy arrays to tensor
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)

        # Adjust for GPU if necessary
        if model.use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()
        
        # Reset hidden state
        # If we will not reset then we will backpropagate through all the training history
        hidden = tuple([state.data for state in hidden])

        model.zero_grad()

        lstm_output, hidden = model.forward(inputs, hidden)
        loss = criterion(lstm_output, targets.view(batch_size*seq_len).long())

        loss.backward()

        # Possibel exploding gradient problem
        # Let's clip just in case
        nn.utils.clip_grad_norm(model.parameters(), max_norm=5)
        optimizer.step()

        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################

        if tracker % 25 == 0:
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()

            for x,y in generate_batches(val_data,batch_size,seq_len):
                # One hot encode incoming data
                x = one_hot_encoder(x,num_char)
                # Convert numpy arrays to tensors
                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)

                # Adjust GPU if necessary
                if model.use_gpu:
                    inputs = inputs.cuda()
                    targets = targets.cuda()

                # Reset hidden state
                # If we don't reset then we would backpropagate through all training history
                val_hidden = tuple([state.data for state in val_hidden])

                lstm_output, val_hidden = model.forward(inputs, val_hidden)
                val_loss = criterion(lstm_output, targets.view(batch_size*seq_len).long())

                val_losses.append(val_loss.item())
            # Reset to training model after after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

  nn.utils.clip_grad_norm(model.parameters(), max_norm=5)


Epoch: 0 Step: 50 Val Loss: 3.234307289123535
Epoch: 1 Step: 75 Val Loss: 3.2133591175079346
Epoch: 1 Step: 100 Val Loss: 3.073793888092041
Epoch: 2 Step: 125 Val Loss: 2.9772727489471436
Epoch: 2 Step: 150 Val Loss: 2.835651159286499
Epoch: 3 Step: 175 Val Loss: 2.7252821922302246
Epoch: 4 Step: 200 Val Loss: 2.6104061603546143
Epoch: 4 Step: 225 Val Loss: 2.469093084335327
Epoch: 5 Step: 250 Val Loss: 2.3806159496307373
Epoch: 5 Step: 275 Val Loss: 2.321687698364258
Epoch: 6 Step: 300 Val Loss: 2.262159824371338
Epoch: 7 Step: 325 Val Loss: 2.2238433361053467
Epoch: 7 Step: 350 Val Loss: 2.1812431812286377
Epoch: 8 Step: 375 Val Loss: 2.1454873085021973
Epoch: 8 Step: 400 Val Loss: 2.116783857345581
Epoch: 9 Step: 425 Val Loss: 2.0916123390197754
Epoch: 10 Step: 450 Val Loss: 2.0678539276123047
Epoch: 10 Step: 475 Val Loss: 2.037212610244751
Epoch: 11 Step: 500 Val Loss: 2.0146727561950684
Epoch: 11 Step: 525 Val Loss: 2.000882863998413
Epoch: 12 Step: 550 Val Loss: 1.982614874839782

-------
------

## Saving the Model

https://pytorch.org/tutorials/beginner/saving_loading_models.html

In [75]:
# Be careful to overwrite our original name file!
model_name = 'example.net'

In [76]:
torch.save(model.state_dict(),model_name)

### Load the model

In [77]:
# MUST MATCH THE EXACT SAME SETTINGS AS MODEL USED DURING TRAINING!
model = CharModel(
    all_chars=all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=False
)
model.load_state_dict(torch.load(model_name))
model.eval()

  model.load_state_dict(torch.load(model_name))


CharModel(
  (lstm): LSTM(84, 512, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc_linear): Linear(in_features=512, out_features=84, bias=True)
)

# Generating Predictions

-----------------------------

In [81]:
# model.encoder
# len(model.all_chars)

In [91]:
def predict_next_char(model, char, hidden=None, k=1):
    # Encode raw letters with model
    encoded_text = model.encoder[char]
    # set as numpy array for one hot encoding
    # NOTE THE [[ ]] dimensions!!
    encoded_text = np.array([[encoded_text]])
    # One hot encoding
    encoded_text = one_hot_encoder(encoded_text=encoded_text,num_uni_chars=len(model.all_chars))
    # Convert to tensors
    inputs = torch.from_numpy(encoded_text)
    # Check for the CPU
    if(model.use_gpu):
        inputs = inputs.cuda()
    # Grab the hidden states
    hidden = tuple([state.data for state in hidden])
    # Run model 
    lstm_out, hidden = model(inputs, hidden)
    # Convert lstm output to probabilities
    probs = F.softmax(lstm_out, dim=1).data

    if(model.use_gpu):
        # move back to cpu to use with numpy
        probs = probs.cpu()

    # k determines how many characters to consider
    # for our probability choice.
    # https://pytorch.org/docs/stable/torch.html#torch.topk
    
    # Return k largest probabilities in tensor
    probs, index_positions = probs.topk(k)
    index_positions = index_positions.numpy().squeeze()

    # Create an arrays of probabilities
    probs = probs.detach().numpy().flatten()
    # Convert to probabilites per index
    probs = probs/probs.sum()
    # Randomly choose a character based on probabilities
    char = np.random.choice(index_positions, p=probs)
    # Return the encoded value of the predicted char and hidden state
    return model.decoder[char], hidden

In [94]:
def generate_text(model, size, seed='The', k=1):
    # Check for GPU
    if(model.use_gpu):
        model.cuda()
    else:
        model.cpu()
    # Evaluation mode
    model.eval()
    # Begin output from initial seed
    output_char = [c for c in seed]
    # Initiate hiddne state
    hidden = model.hidden_state(1)
    # Predict the next character for every character in the seed
    for char in seed:
        char, hidden = predict_next_char(model, char, hidden, k=k)

    # Add initial character to output
    output_char.append(char)
    # Now generate for size requested
    for i in range(size):
        # Predict based off very last letter in output_chars
        char, hidden = predict_next_char(model, output_char[-1], hidden, k=k)
        # Add predicted character
        output_char.append(char)
    # Return string of predicted text
    return ''.join(output_char)

In [97]:
print(generate_text(model, 1000, seed='The ', k=3))

The counter on the son.
    The soldier this the servicious of her swow.
    Where is not to the serving to my side,
    And what is some to thee we should have stand
    Than she in that the world of that I would be son
    To be to speak and song a senting show.
    I was not the word of the story with man
    Thou art a strange offer. That the charge
    That she shall see him that I will see thee
    A strength of the formen senst a cansee forgute,
    As the break of a poor for the with this shake
    And present my straight this were brother and him.
    We we consent me we see that to show them strong
    As to the strong as this were sone as so this
    The suble service of my fair foot this in mind
    And be the with the with.
  MENAS. Well, to-morrow, there.
    The grace is not my soldier, and we see
    That were a past of than and trith the come
    The way of soldier well. The gentle spirit
    As this in mark and see the common will,
    And to be brother, that will bri