### Prepare Environment

In [40]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

## Load Data

In [41]:
with open('data/anna.txt') as f:
    text=f.read()

In [42]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

## Tokenization

Step 1: By using **set**, a set of all unique characters is generated.

In [43]:
chars=set(text)
print('Text Length: ',len(text))
print('Number of Characters: ',len(chars))

Text Length:  1985223
Number of Characters:  83


In [44]:
chars=tuple(set(text))

Step 2: A dictionary is created by adding an index to the set of unique characters. 

In [45]:
int2dict=dict(enumerate(chars))

In [46]:
print(int2dict)

{0: 'J', 1: 'a', 2: 'N', 3: 'g', 4: '_', 5: '(', 6: 'b', 7: 'e', 8: 'A', 9: 'r', 10: '5', 11: 'G', 12: 'I', 13: 'd', 14: '0', 15: '3', 16: 'S', 17: ',', 18: '@', 19: 'K', 20: '.', 21: ')', 22: 'D', 23: '2', 24: 'R', 25: 'Y', 26: 'H', 27: 'B', 28: 'O', 29: '!', 30: 'L', 31: 'P', 32: '4', 33: 't', 34: 'j', 35: 'y', 36: 'l', 37: '&', 38: '6', 39: 'Z', 40: '%', 41: 'p', 42: '?', 43: 'h', 44: 'i', 45: 'M', 46: 'f', 47: 'v', 48: ';', 49: '`', 50: 'w', 51: '7', 52: 'V', 53: 'x', 54: 'c', 55: '1', 56: 'q', 57: 's', 58: "'", 59: '$', 60: 'F', 61: '8', 62: 'U', 63: '9', 64: '*', 65: 'k', 66: '"', 67: 'u', 68: '/', 69: 'z', 70: 'C', 71: '\n', 72: 'T', 73: 'W', 74: ' ', 75: 'X', 76: 'o', 77: 'n', 78: 'E', 79: 'm', 80: '-', 81: 'Q', 82: ':'}


Step 4: Index and character are switched. Thus, the text can be encoded to integer values by
using the character as a key.

In [47]:
char2int={ch: ind for ind,ch in int2dict.items()}

In [48]:
print(char2int)

{'J': 0, 'a': 1, 'N': 2, 'g': 3, '_': 4, '(': 5, 'b': 6, 'e': 7, 'A': 8, 'r': 9, '5': 10, 'G': 11, 'I': 12, 'd': 13, '0': 14, '3': 15, 'S': 16, ',': 17, '@': 18, 'K': 19, '.': 20, ')': 21, 'D': 22, '2': 23, 'R': 24, 'Y': 25, 'H': 26, 'B': 27, 'O': 28, '!': 29, 'L': 30, 'P': 31, '4': 32, 't': 33, 'j': 34, 'y': 35, 'l': 36, '&': 37, '6': 38, 'Z': 39, '%': 40, 'p': 41, '?': 42, 'h': 43, 'i': 44, 'M': 45, 'f': 46, 'v': 47, ';': 48, '`': 49, 'w': 50, '7': 51, 'V': 52, 'x': 53, 'c': 54, '1': 55, 'q': 56, 's': 57, "'": 58, '$': 59, 'F': 60, '8': 61, 'U': 62, '9': 63, '*': 64, 'k': 65, '"': 66, 'u': 67, '/': 68, 'z': 69, 'C': 70, '\n': 71, 'T': 72, 'W': 73, ' ': 74, 'X': 75, 'o': 76, 'n': 77, 'E': 78, 'm': 79, '-': 80, 'Q': 81, ':': 82}


Step 5: Encode text from characters to numbers. Generate a Numpy array out of it.

In [49]:
encoded=np.array([char2int[ch] for ch in text])

In [50]:
print(encoded)

[70 43  1 ... 57 20 71]


In [51]:
print(encoded[:100])

[70 43  1 41 33  7  9 74 55 71 71 71 26  1 41 41 35 74 46  1 79 44 36 44
  7 57 74  1  9  7 74  1 36 36 74  1 36 44 65  7 48 74  7 47  7  9 35 74
 67 77 43  1 41 41 35 74 46  1 79 44 36 35 74 44 57 74 67 77 43  1 41 41
 35 74 44 77 74 44 33 57 74 76 50 77 71 50  1 35 20 71 71 78 47  7  9 35
 33 43 44 77]


## Preprocessing the data: One Hot Encoding

Step 1: Generate function for one hot encoding. For every integer value from our dictionary a 
vector is generated. The number of labels (n_labels) corresponds to the length of our char2int-
dictionary and the number of columns in this new vector. The vector will be a row vector in which all values will be zero except one. This will be one. This value of one is on the position of the key of the character in our dictionary.

Example text: aba aaaac bbba accb
Dictionary: {'a':0,'b':1,'c':2}

One Hot vector length = 3 ([000])    => (there are 3 different characters in our example text)<br>
First letter [100]                   => (key 'a' has value 0 in our dictionary), second letter [010] and so on


In [100]:
def one_hot_encode(arr, n_labels):
    len_seq=len(arr[0])
    row_ct=len_seq
    one_hot=np.zeros((row_ct,n_labels),dtype=np.float32)
    print('One Hot Shape: ',one_hot.shape)
    one_hot[np.arange(row_ct),arr.flatten()]=1.
    one_hot=one_hot.reshape(*arr.shape,n_labels)
    return one_hot

### *Test:* One Hot Encoding

In [53]:
test_sequence=np.array([[1,2,5,4,3]])
one_hot_result=one_hot_encode(test_sequence,8)
print(one_hot_result)

[[[0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0.]]]


## Making Batches

In [114]:
def get_batches(arr,batch_size,seq_length):
    ''' Create a generator that returns batches of size 
        batch_size x seq_length from arr.
        
        Arguments:
        arr: text
        batch_size: Batch size = number of sequences per batch
        seq_lenght: Number of encoded chars in a sequence'''
    
    #Convert parameters to integer
    batch_size=int(batch_size)
    seq_length=int(seq_length)
    
    #Calculate total size of one batch (N x M where N=number of batches and M=seq_length)
    #Equal to number of characters in one batch
    batch_size_total=int(seq_length*batch_size)
    #Calculate number of batches in whole text by dividing number of all characters
    #by number of characters in one batch
    n_batches=len(arr)//batch_size_total
    print('Number of batches: ',n_batches)
    print('Total batch size: ',batch_size_total)
    #In the end, the text has to be cut. This means, that the characters at the end of the
    #text will be discarded because they will not fit in one whole batch and thus would
    #insert errors. The first character which will not fit in a batch anymore can be
    #computed by simply multiplying number of batches with number of characters per batch
    last_char=int(n_batches*batch_size_total)
    print('Characters that fit in batches: ',last_char)
    print('Original text: ',len(arr))
        
    #Slice text and cut off characters that do not fit in the last batch anymore.
    #So, every character is in one batch at the end.
    arr=arr[:last_char]
    
    #Text is reshaped now that only the characters fitting in one batch are left
    #The number of rows = number of batches (hyperparameter)
    #The number of columns depends on the batch_size and on the length of the whole text
    #(-1) allows this number of columns to be adjusted dynamically
    arr=arr.reshape((batch_size,-1))
    
    #Print Shape of final array which will be used for batch generation
    print('Arr-Shape', arr.shape)
    print('Number of Rows in total/one batch = Arr-Shape of Dimension 0: ', arr.shape[0])
    print('Number of Columns in total = Arr-Shape of Dimension 1:',arr.shape[1])
    print('Number of Columns in one batch = Seq-Length (Window size, x-Dimension): ', seq_length)
    
    number_columns_text=arr.shape[1]
    number_columns_batch=seq_length
    
    for n in range(0,number_columns_text,number_columns_batch):
        
        #Features (Characters)
        x=arr[:,n:n+number_columns_batch]
        print('x-Shape: ',x.shape)
        y=np.zeros_like(x)
        print('y-Shape: ',x.shape)
        print(y)
        try:
            y[:,:-1],y[:,-1]=x[:,1:],arr[:,n+number_columns_batch]
        except IndexError:
            y[:,:-1],y[:,-1]=x[:,1:],arr[:0]
        print('Cycle: ',n)
        print('X:',x)
        print('Y:',y)
        yield x,y
        

### Test Batch Making

In [115]:
batches=get_batches(encoded,8,50)
x,y=next(batches)

Number of batches:  4963
Total batch size:  400
Characters that fit in batches:  1985200
Original text:  1985223
Arr-Shape (8, 248150)
Number of Rows in total/one batch = Arr-Shape of Dimension 0:  8
Number of Columns in total = Arr-Shape of Dimension 1: 248150
Number of Columns in one batch = Seq-Length (Window size, x-Dimension):  50
x-Shape:  (8, 50)
y-Shape:  (8, 50)
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0

In [105]:
print(x[:10,:10])
print(y[:10,:10])

[[70 43  1 41 33  7  9 74 55 71]
 [57 76 77 74 33 43  1 33 74  1]
 [ 7 77 13 74 76  9 74  1 74 46]
 [57 74 33 43  7 74 54 43 44  7]
 [74 57  1 50 74 43  7  9 74 33]
 [54 67 57 57 44 76 77 74  1 77]
 [74  8 77 77  1 74 43  1 13 74]
 [28  6 36 76 77 57 65 35 20 74]]
[[43  1 41 33  7  9 74 55 71 71]
 [76 77 74 33 43  1 33 74  1 33]
 [77 13 74 76  9 74  1 74 46 76]
 [74 33 43  7 74 54 43 44  7 46]
 [57  1 50 74 43  7  9 74 33  7]
 [67 57 57 44 76 77 74  1 77 13]
 [ 8 77 77  1 74 43  1 13 74 57]
 [ 6 36 76 77 57 65 35 20 74 66]]


### *Test Batch Making:* Is Matrix Y shifted by one character?

In [106]:
test=0
if x[:1,1]==y[:1,0]:
    if x[2:3,5]==y[2:3,4]:
        if x[3:4,8]==y[3:4,7]:
            print("It worked. Frame is shifted. Y matrix is one step ahead of matrix X.")
            test=1
if test==0:
    print("Frame shift of y matrix did not work. Please review matrix Y generation.")

It worked. Frame is shifted. Y matrix is one step ahead of matrix X.


### Defining the Network with PyTorch

Step 1: Move to GPU if available

In [107]:
train_on_gpu=torch.cuda.is_available()
if (train_on_gpu):
    print('GPU detected. Will train on GPU!')
else:
    print('No GPU detected. CPU training!')

No GPU detected. CPU training!


Step 2: Define Architecture of Character-Level RNN

In [108]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        
        super().__init__()
        
        self.drop_prob=drop_prob
        self.n_layers=n_layers
        self.n_hidden=n_hidden
        self.lr=lr
        
        self.chars=tokens
        self.int2char=dict(enumerate(self.chars))
        self.char2int={ch: ii for ii,ch in self.int2char.items()}
        
        self.lstm=nn.LSTM(len(self.chars), n_hidden,n_layers,dropout=drop_prob,batch_first=True)
        self.dropout=nn.Dropout(drop_prob)
        self.fc=nn.Linear(n_hidden,len(self.chars))
        
    def forward(self,x,hidden):
        
        r_output,hidden=self.lstm(x,hidden)
        out=self.dropout(r_output)
        #Stack LSTM layers
        out=out.contiguous().view(-1,self.n_hidden)
        out=self.fc(out)
        
        return out,hidden
    
    def init_hidden(self, batch_size):
        
        weight=next(self.parameters()).data
        
        if (train_on_gpu):
             hidden=(weight.new(self.n_layers,batch_size,self.n_hidden).zero().cuda(),
                     weight.new(self.n_layers,batch_size,self.n_hidden).zero().cuda())
        else:
             hidden=(weight.new(self.n_layers,batch_size,self.n_hidden).zero_(),
                     weight.new(self.n_layers,batch_size,self.n_hidden).zero_()) 
        
        return hidden
    
        

## Train the LSTM RNN

In [122]:
def train_rnn(net,data,epochs=1,batch_size=10,seq_length=50,lr=0.001,
              clip=5,val_frac=0,print_every=10):
    
    net.train()
    
    #Define Hyperparameters Criterion,Optimizer
    opt=torch.optim.Adam(net.parameters(),lr=lr)
    criterion=nn.CrossEntropyLoss()
    
    #Split in Trainiing and Validation Data
    #val_frac=percentage of validation set (0.1=10%)
    val_idx=int(len(data)*(1-val_frac))
    data,val_data=data[:val_idx],data[:]
    
    #Move to GPU if available
    if train_on_gpu:
        net.cuda()
    
    counter=0
    n_chars=len(net.chars)
    
    for e in range(epochs):
        
        #initialize hidden state
        h=net.init_hidden(batch_size)
        
        for x,y in get_batches(data,batch_size,seq_length):
            
            #Epoch counter
            counter+=1
            
            #One Hot Encoding
            print('x: ',x.shape)
            print('n_chars: ',n_chars)
            x=one_hot_encode(x,n_chars)
            #Tranform Numpy arrays to tensors for input to network
            inputs,targets=torch.from_numpy(x),torch.from_numpy(y)
            
            #Move to GPU if available
            if train_on_gpu:
                inputs,targets=inputs.cuda(),targets.cuda()
            
            #Create new, empty hidden state to loose history
            h=tuple([each.data for each in h])
            
            #Delete Gradients
            net.zero_grad()
            
            #Forward Pass
            output,h=net(inputs,h)
            
            #Cost
            loss=criterion(output,targets.view(batch_size*seq_length))
            
            #Backpropation
            loss.backward() 
            nn.utils.clip_grad_norm(net.parameters(),clip)
            opt.step()
            
            if counter%print_every==0:
                
                val_hidden=net.init_hidden(batch_size)
                val_losses=[]
                
                net.eval()
                
                for x,y in get_batches(data,batch_size,seq_length):
                    
                    #One Hot Encoding
                    x=one_hot_encode(x,n_chars)
                    
                    #Transform Numpy arrays to tensors for input to network
                    x,y=torch.from_numpy(x),torch.from_numpy(y)
                    
                    if train_on_gpu:
                        inputs,targets=inputs.cuda(),targets.cuda()
                    
                    val_hidden=tuple([each.data for each in data])
                    
                    #Forward Pass
                    output,val_hidden=net(inputs,val_hidden)
                    
                    #Cost
                    val_loss=criterion(output,targets.view(batch_size*seq_length))
                    
                    #Extend list with validation losses
                    val_losses.append(val_loss.item())
                
                net.train()
                
                print("Epoch: {}/{}...".format(e+1,epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:,.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
    
        

## Instantiate the Model

In [123]:
n_hidden=256
n_layers=2

net=CharRNN(chars,n_hidden,n_layers)

## Set Hyperparameters and Start Training

In [124]:
batch_size=8
seq_length=50
n_epochs=1

train_rnn(net,encoded,epochs=n_epochs,batch_size=batch_size,seq_length=seq_length,
     lr=0.001,print_every=1)

Number of batches:  4963
Total batch size:  400
Characters that fit in batches:  1985200
Original text:  1985223
Arr-Shape (8, 248150)
Number of Rows in total/one batch = Arr-Shape of Dimension 0:  8
Number of Columns in total = Arr-Shape of Dimension 1: 248150
Number of Columns in one batch = Seq-Length (Window size, x-Dimension):  50
x-Shape:  (8, 50)
y-Shape:  (8, 50)
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0

IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (50,) (400,) 