# LSTM in PyTorch.
Character level RNN for generating text based on input on which it has been trained.
Important links<br>
- [Andrej Karpathy post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)
- [Understanding LSTM Networks](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)
- [Exploring LSTM](http://blog.echen.me/2017/05/30/exploring-lstms/)
- [Geoffrey Hinton ppts.](http://www.cs.toronto.edu/~hinton/talks.html)

General architecture of the network:
![Architecture](assets/charseq.jpeg "RNN")



## 0. Imports


In [1]:
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

In [2]:
# Loading the nobel
PATH = 'data/'
with open(PATH+'anna.txt' ,'r') as f:
  text = f.read()

print(text[:50])

Chapter 1


Happy families are all alike; every un


## 1. Tokenize and make data data ready.

In [13]:
# Vocabulary
chars = tuple(set(text))
# Dictionaries for mapping characters to integers and vice versa.
int2char = dict(enumerate(chars))
char2int = {c:i for i,c in int2char.items()}
# Encode entire text to numbers i.e each character with its numerical value.
encoded = np.array([char2int[ch] for ch in text])

print('char2int:\n{}\n\nint2char:\n{}\n\nLength:{}\n\nEncoded:\n{}\n\n'.format(char2int,int2char,len(encoded),encoded[:50]))

def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Reshape
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

char2int:
{'E': 0, ':': 1, '4': 2, '@': 3, '$': 4, 'S': 5, '9': 6, 'm': 7, 'F': 8, 'T': 9, 'B': 10, 'y': 11, 'K': 12, 'b': 13, 'I': 14, '%': 15, '&': 16, '"': 17, '\n': 18, 'w': 19, '/': 20, 'Q': 21, 'V': 22, 'L': 23, 'u': 24, 'n': 25, 'o': 26, 'D': 27, 'X': 28, 'Z': 29, '!': 30, 't': 31, '?': 32, 'h': 33, 'r': 34, 'M': 35, 'O': 36, 'j': 37, 'H': 38, 'W': 39, '`': 40, 'a': 41, '-': 42, ' ': 43, 'G': 44, 'z': 45, 'C': 46, 'd': 47, 'J': 48, '6': 49, 'k': 50, '1': 51, 'c': 52, '0': 53, 'P': 54, 'N': 55, 'R': 56, '_': 57, '(': 58, 'Y': 59, ',': 60, 'g': 61, '2': 62, '7': 63, '*': 64, 'v': 65, 'U': 66, '.': 67, 'e': 68, 's': 69, 'l': 70, 'x': 71, 'q': 72, '5': 73, 'p': 74, ';': 75, '8': 76, '3': 77, 'f': 78, 'i': 79, "'": 80, 'A': 81, ')': 82}

int2char:
{0: 'E', 1: ':', 2: '4', 3: '@', 4: '$', 5: 'S', 6: '9', 7: 'm', 8: 'F', 9: 'T', 10: 'B', 11: 'y', 12: 'K', 13: 'b', 14: 'I', 15: '%', 16: '&', 17: '"', 18: '\n', 19: 'w', 20: '/', 21: 'Q', 22: 'V', 23: 'L', 24: 'u', 25: 'n', 26: 'o', 27: '

## 2. Minibatches
![Batches](assets/sequence_batching_ex.png "Batches")

In [0]:
def get_batches(arr, batch_size, seq_length):
  '''
  Generator which returns batch of size : batch_size*seq_length
  '''
  chars_batch = batch_size * seq_length
  n_batches = len(arr) // chars_batch
  
  # Make full batches
  arr = arr[: n_batches * chars_batch]
  # Resize the array
  arr = arr.reshape((batch_size,-1))
  # Get Batches
  for i in range(0, arr.shape[1], seq_length):
    # Features
    x = arr[:, i:i + seq_length]
    # Targets shifted by One in future
    y = np.zeros_like(x)
    try:
      y[:,:-1], y[:,-1] = x[:,1:], arr[:,i+seq_length]
    except IndexError:
      y[:,:-1], y[:,-1] = x[:,1:], arr[:,0]
    
    yield x,y
    

In [36]:
# TEST RUN
batches = get_batches(encoded, 8, 50)
x, y = next(batches)
# printing out the first 10 items in a sequence
print('Shape', x.shape)
print('x\n',x)
print('\ny\n', y[:, :10])

Shape (8, 50)
x
 [[82  7 17 47 65 62  8 63 41 57 57 57 56 17 47 47 74 63 59 17 67 51 16 51
  62 52 63 17  8 62 63 17 16 16 63 17 16 51 40 62 75 63 62 66 62  8 74 63
  76 70]
 [52 55 70 63 65  7 17 65 63 17 65 65  8 17 25 65 62 72 63  7 62  8 63 17
  65 65 62 70 65 51 55 70 63 22 17 52 63  7 62  8 63  7 76 52 68 17 70 72
  31 63]
 [62 70 72 63 55  8 63 17 63 59 55 62  3 63  7 62 63 17 66 55 51 72 62 72
  63  7 51 52 63 59 17 65  7 62  8 31 63 56 62 57 16 55 55 40 62 72 63  8
  55 76]
 [52 63 65  7 62 63 25  7 51 62 59 63 65  7 55 76 30  7 63  7 51 72 72 62
  70 57 51 70 65 62  8 62 52 65 63 55 59 63  7 51 52 63 16 51 59 62  3 63
  55 59]
 [63 52 17 22 63  7 62  8 63 65 62 17  8 27 52 65 17 51 70 62 72  3 63 47
  51 65 51 59 76 16  3 63 52 22 62 62 65 63 59 17 25 62  3 57 67 51 52 62
   8 17]
 [25 76 52 52 51 55 70 63 17 70 72 63 17 70 17 16 74 52 51 52  3 63 22 17
  52 63 51 70 63 47  8 51 70 25 51 47 16 62 63 72 51 52 17 30  8 62 62 17
  68 16]
 [63 79 70 70 17 63  7 17 72 63 52 17 51 

In [10]:
# GPU check
gpu_is = torch.cuda.is_available()
if(gpu_is):
  print('Training on GPU')
else:
  print('NO GPU | Training on CPU')

NO GPU | Training on CPU


## 3. Defining Network

In [7]:
class CharRNN(nn.Module):
  def __init__(self, tokens, n_hidden = 256, n_layers = 2, drop_prob = 0.3, lr = 0.001):
    super().__init__()
    self.drop_prob = drop_prob
    self.n_hidden = n_hidden
    self.n_layers = n_layers
    self.lr = lr
    
    # Character dictionaries
    self.chars = tokens
    self.int2char = dict(enumerate(self.chars))
    self.char2int = {c:i for i,c in self.int2char.items()}
    
    # LSTM 
    self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout = drop_prob, batch_first = True)
    # Dropout
    self.dropout = nn.Dropout(drop_prob)
    # Fully connected final output
    self.fc = nn.Linear(n_hidden, len(self.chars))
    
  def forward(self, x, hidden):
    '''
    Forward pass through the network. x - Input, hidden is the cell state or hidden state.(memory).  
    '''
    # Output from LSTM
    r_out, hidden = self.lstm(x, hidden)
    out = self.dropout(r_out)
    out = out.contiguous().view(-1,self.n_hidden)
    out = self.fc(out)
    return out, hidden
  
  def init_hidden(self, batch_size):
    '''
    Initialize hidden state. 
    '''
    weight = next(self.parameters()).data
    if (gpu_is):
      hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
              weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
    else:
      hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
    return hidden
    

In [5]:
# Train

def train(net, data, epochs = 10, batch_size = 10, seq_length = 50, lr = 0.001, clip = 5, val_frac = 0.2, print_every=15):
  '''
  Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss.
  '''
  net.train()
  opt = torch.optim.Adam(net.parameters(), lr = lr)
  criterion = nn.CrossEntropyLoss()
  
  # training and validation data
  val_idx = int(len(data) * (1 - val_frac))
  data, val_data = data[:val_idx], data[val_idx:]
  
  if (gpu_is):
    net.cuda()
  
  counter = 0
  n_chars = len(net.chars)
  for e in range(epochs):
    h = net.init_hidden(batch_size)
    for x,y in get_batches(data, batch_size, seq_length):
      counter += 1
      x = one_hot_encode(x, n_chars)
      inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
      if (gpu_is):
        inputs, targets = inputs.cuda(), targets.cuda()
      # Creating new variable for the hidden state.
      # otherwise it would back propogate through the entire history
      h = tuple([each.data for each in h])
      
      # Zero accumulated Gradients
      net.zero_grad()
      
      output, h = net.forward(inputs, h)
      loss = criterion(output, targets.view(batch_size * seq_length).long())
      loss.backward()
      
      # Clip gradients to overcome the exploding gradient problem in RNNs
      nn.utils.clip_grad_norm_(net.parameters(), clip)
      opt.step()
      # LOSS STATS
      if counter % print_every == 0:
        net.eval()
        val_h = net.init_hidden(batch_size)
        val_losses = []
        for x,y in get_batches(val_data, batch_size, seq_length):
          x = one_hot_encode(x, n_chars)
          inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
          if (gpu_is):
            inputs, targets = inputs.cuda(), targets.cuda()
          val_h = tuple([each.data for each in val_h])
          output, val_h = net.forward(inputs, val_h)
          val_loss = criterion(output, targets.view(batch_size * seq_length).long())
          val_losses.append(val_loss.item())
          
        net.train()
        print("Epoch: {}/{} ...".format(e+1, epochs),
              "Step: {} ...".format(counter),
              "Loss: {:.4f} ...".format(loss.item()),
              "Val Loss: {:.4f}".format(np.mean(val_losses)))
          
      
      
      

In [7]:
# define and print the net

n_hidden=512
n_layers=2
dropout = 0.4
net = CharRNN(chars, n_hidden, n_layers,dropout)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.4)
  (dropout): Dropout(p=0.4)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


## 4. Train

In [68]:
%%time
batch_size = 128
seq_length = 100
n_epochs = 20 # start smaller if you are just testing initial behavior

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=50)

Epoch: 1/20 ... Step: 50 ... Loss: 3.1179 ... Val Loss: 3.1165
Epoch: 1/20 ... Step: 100 ... Loss: 2.9335 ... Val Loss: 2.9379
Epoch: 2/20 ... Step: 150 ... Loss: 2.5052 ... Val Loss: 2.4835
Epoch: 2/20 ... Step: 200 ... Loss: 2.3007 ... Val Loss: 2.3130
Epoch: 3/20 ... Step: 250 ... Loss: 2.1763 ... Val Loss: 2.1645
Epoch: 3/20 ... Step: 300 ... Loss: 2.0873 ... Val Loss: 2.0495
Epoch: 3/20 ... Step: 350 ... Loss: 1.9866 ... Val Loss: 1.9478
Epoch: 4/20 ... Step: 400 ... Loss: 1.8681 ... Val Loss: 1.8628
Epoch: 4/20 ... Step: 450 ... Loss: 1.8255 ... Val Loss: 1.7939
Epoch: 5/20 ... Step: 500 ... Loss: 1.7616 ... Val Loss: 1.7325
Epoch: 5/20 ... Step: 550 ... Loss: 1.7071 ... Val Loss: 1.6814
Epoch: 5/20 ... Step: 600 ... Loss: 1.6754 ... Val Loss: 1.6415
Epoch: 6/20 ... Step: 650 ... Loss: 1.6234 ... Val Loss: 1.6050
Epoch: 6/20 ... Step: 700 ... Loss: 1.5930 ... Val Loss: 1.5691
Epoch: 7/20 ... Step: 750 ... Loss: 1.5473 ... Val Loss: 1.5448
Epoch: 7/20 ... Step: 800 ... Loss: 1.528

## 5. Save

In [0]:
model_name = 'char_rnn.net'
checkpoint = {'n_hidden':net.n_hidden,
             'n_layers':net.n_layers,
             'state_dict':net.state_dict(),
             'tokens':net.chars}
with open(PATH + model_name, 'wb') as f:
  torch.save(checkpoint, f)

## 6. Predict and generate text

In [4]:
def predict(net, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        
        if(gpu_is):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(gpu_is):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2char[char], h

In [5]:
def sample(net, size, prime='The', top_k=None):
        
    if(gpu_is):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # Prime word
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [78]:
print(sample(net, 1000, prime='Anna', top_k=5))

Anna, he would not be as to be, tryented that the
position was necessary on her for her fingers in her mother. And an her there were no posicion
that he would
say, and would be ten and
support. And the doctor and wish alone to the stares without
an incloved to him, with all the sard to her shame in a charm and hope, and took his shout that thought which he saw the childree and the
man and he had to be talking of the subject of that serious tables and things with her shade of the men in his hand. The sound of the same time, and he wanted
to take his former this part where his best to go on, and had so going, she could
not say, the court how he would not take of him work in half, and with a stretch hore his eyes shirk watcing a little time when the conversation was the steps to see the
proper stairs on another shame with
surery
and seemed in his baby and the
point of him. He had to
distract the conversation in the past of his hand, and had askong him with what he was the fear for a splan

In [8]:
# LOAD model and check
model_name = 'char_rnn.net'
with open(PATH+model_name, 'rb') as f:
  checkpoint = torch.load(f,map_location='cpu') # Remove map_location if using GPU.
  
loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'], drop_prob=0.4)
loaded.load_state_dict(checkpoint['state_dict'])
print(loaded)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.4)
  (dropout): Dropout(p=0.4)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [11]:
# Sample using a loaded model
# Helper functions required are...
# predict, sample, one_hot_encode, Network architecture.
print(sample(loaded, 2000, top_k=5, prime="The boy asked"))

The boy asked him, and at the table, and his boots of the prestraiting
close of her eyes the starried settless and the conversation, as he was
striking at the creat of them in her mind a step to her hand and terrously before anything before the charm, which was
at the childroo at
that mamening--his son.

"I won't go to see."

"Why so and I have said anything," said Anna, smiling,
walking away her head.

"You can't say that I can do nothing with me to think of my heart," he answered, and as she was at once time that she went up tightly
by a second
princess. She had been so that the mere sent of that with his soul, and he said something to
him with a
subder and take a finger and his, at a land would
be true, and he was thinking to this since anything of the state of his feet. She did not
stay her head to his story out of the steps and the same the princess, with her head again.

Sergey Ivanovitch was a conviction what he saw happiness whether the stants steps, a peacant child, which was 