# RNN(PyTorch) - Character-Level LSTM
* Project: To generate new text based on the text from the book
* Data source: Anna Karenina (attached is the book)
* Author: Shashi Kiran Chilukuri

### Importing libraries

In [2]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

### Loading data

In [5]:
with open('data/anna.txt', 'r') as f:
    text = f.read()
# Lets print firt 100 characters of the text
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

### Tokenization

In [6]:
chars = tuple(set(text))                          # To create a tuple of unique characters from text
int2char = dict(enumerate(chars))                 # Create dictionary of integer to chacter mapping
char2int = {ch: ii for ii, ch in int2char.items()}# Create dictionary of chacter to integer mapping
encoded = np.array([char2int[ch] for ch in text]) # Encode the text
# Lets print first 100 values of the characters of the text
encoded[:100]

array([63, 80, 42, 81,  0, 62, 48, 53, 56, 55, 55, 55, 76, 42, 81, 81,  7,
       53,  5, 42, 22,  8, 26,  8, 62, 13, 53, 42, 48, 62, 53, 42, 26, 26,
       53, 42, 26,  8, 50, 62, 78, 53, 62, 38, 62, 48,  7, 53, 51, 58, 80,
       42, 81, 81,  7, 53,  5, 42, 22,  8, 26,  7, 53,  8, 13, 53, 51, 58,
       80, 42, 81, 81,  7, 53,  8, 58, 53,  8,  0, 13, 53, 25, 17, 58, 55,
       17, 42,  7, 79, 55, 55, 24, 38, 62, 48,  7,  0, 80,  8, 58])

### One hot encoding

In [16]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [28]:
# Lets check after applying one hot encoder
one_hot_encode(encoded[:100].reshape(100,1), 83)

array([[[ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  1.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       ..., 
       [[ 0.,  0.,  0., ...,  1.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  0.,  0.,  0.]]], dtype=float32)

### Creating mini batches
* Starting sequence:
    arr = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]
* Number of sequences in a Batch (`Batch_size`)(N) = 2
    [1, 2, 3, 4, 5, 6, 7, 8]
    [9, 10,11,12,13,14,15,16]
* Sequence length(M) = 3
    1, 2, 3   4, 5, 6   ...
    7, 8, 9  10,11,12  ...  
* So from above step each batch contains N * M characters
* Total Number of batches(K) = len(arr)/(N * M)
* then just keep the arry = arr[:Total batches, Number of sequences(N) * Seq Length(M)]
* reshape that arry = arr.reshape(Number of sequences, -1)
* iterate through the array, one sequence at a time

In [32]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    # total number of batches we can make
    n_batches = len(arr)//(batch_size * seq_length)
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * (batch_size * seq_length)]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        x = arr[:, n:n+seq_length] # The features
        y = np.zeros_like(x)       # The targets, shifted by one
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

### Testing get batches function

In [37]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

# printing
print('x\n',x[:15,:15])
print('y\n',y[:15,:15])

x
 [[63 80 42 81  0 62 48 53 56 55 55 55 76 42 81]
 [13 25 58 53  0 80 42  0 53 42  0  0 48 42  1]
 [62 58 68 53 25 48 53 42 53  5 25 62 33 53 80]
 [13 53  0 80 62 53  1 80  8 62  5 53  0 80 25]
 [53 13 42 17 53 80 62 48 53  0 62 42 48 66 13]
 [ 1 51 13 13  8 25 58 53 42 58 68 53 42 58 42]
 [53 70 58 58 42 53 80 42 68 53 13 42  8 68 53]
 [74 32 26 25 58 13 50  7 79 53 75 15 51  0 53]]
y
 [[80 42 81  0 62 48 53 56 55 55 55 76 42 81 81]
 [25 58 53  0 80 42  0 53 42  0  0 48 42  1  0]
 [58 68 53 25 48 53 42 53  5 25 62 33 53 80 62]
 [53  0 80 62 53  1 80  8 62  5 53  0 80 25 51]
 [13 42 17 53 80 62 48 53  0 62 42 48 66 13  0]
 [51 13 13  8 25 58 53 42 58 68 53 42 58 42 26]
 [70 58 58 42 53 80 42 68 53 13 42  8 68 53  0]
 [32 26 25 58 13 50  7 79 53 75 15 51  0 53 45]]


### Defining LSTM

self.lstm = nn.LSTM(input_size, n_hidden, n_layers, dropout=drop_prob, batch_first=True)

LSTM parametes: 
* input_size (the number of characters), 
* n_hidden (number of units in the hidden layers in the cell), 
* n_layers (a number of layers), 
* drop_prob (a dropout probability), and 
* batch_first (boolean.True, since when batching)

In [36]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

No GPU available, training on CPU; consider making n_epochs very small.


In [None]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        # Define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers,dropout=drop_prob, batch_first=True)
        
        # Define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # Define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        ## TODO: Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)
        
        ## TODO: pass through a dropout layer
        out = self.dropout(r_output)
        
        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)
        
        ## TODO: put x through the fully-connected layer
        out = self.fc(out)
        
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden
        