<a href="https://colab.research.google.com/github/TousifAhamed/Deep-Learning-Udacity-Nano-Degree/blob/master/char_level_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import torch 
from torch import nn
import torch.nn.functional as F

# Loading data

In [0]:
# open text file and read in data as text
with open('anna.txt','r') as f:
  text = f.read()

In [66]:
text



In [67]:
text[:100] # checking first 100 characters

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

# **Tokenization**

In [0]:
# Encode the text and map each character to and integer and viceversa
# We create two dictionaries:
# 1. int2char, which maps integers to characters
# 2. char2int, which maps characters to unique integers
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}
#int2char
#char2int

# encode the text

In [0]:
encoded = np.array([char2int[ch] for ch in text])

In [70]:
encoded

array([62, 81, 17, ..., 73, 15, 34])

In [71]:
encoded[1]

81

In [72]:
encoded[:100] # Characters encoded as integers

array([62, 81, 17, 35, 13, 58,  7, 77, 52, 34, 34, 34, 68, 17, 35, 35,  8,
       77, 16, 17, 11, 79,  6, 79, 58, 73, 77, 17,  7, 58, 77, 17,  6,  6,
       77, 17,  6, 79, 18, 58, 21, 77, 58, 26, 58,  7,  8, 77, 37, 36, 81,
       17, 35, 35,  8, 77, 16, 17, 11, 79,  6,  8, 77, 79, 73, 77, 37, 36,
       81, 17, 35, 35,  8, 77, 79, 36, 77, 79, 13, 73, 77, 47, 28, 36, 34,
       28, 17,  8, 15, 34, 34, 14, 26, 58,  7,  8, 13, 81, 79, 36])

# Preprocessing the **data**

In [0]:
import numpy as np
def one_hot_encode(arr, n_labels):
  # Initialize the encoded array
  print('Arr Shape',arr.shape)
  one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
  
  #print("After Initialization: ",one_hot)
  #print('one hot share',one_hot.shape[0])
  #print('One hot shape',np.arange(one_hot.shape[0]))
  #print('arr flatten', arr.flatten())
  # Fill the appropriate elements with ones
  one_hot[np.arange(one_hot.shape[0]),arr.flatten()] = 1.
  
  print('After Filling ones: ',one_hot)
  
  print('*arr shape',*arr.shape)
  print('n_labels',n_labels)
  print('One hot before reshape',one_hot.shape)
  # Finally reshape it to get back to the original array
  one_hot = one_hot.reshape((*arr.shape,n_labels))
  print('One hot after reshape',one_hot.shape)
  #print('Finally Reshape: ',one_hot)
  
  return one_hot    

In [74]:
# Check that function works as expected
test_seq = np.array([[3,5,1]])
print(80*'*')
print(test_seq.shape)
print(test_seq)
print(80*'*')
one_hot = one_hot_encode(test_seq, 8)
print(80*'*')
print(one_hot)

********************************************************************************
(1, 3)
[[3 5 1]]
********************************************************************************
Arr Shape (1, 3)
After Filling ones:  [[0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]]
*arr shape 1 3
n_labels 8
One hot before reshape (3, 8)
One hot after reshape (1, 3, 8)
********************************************************************************
[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


# Flatterning

numpy.ndarray.flatten
method

ndarray.flatten(order='C')
Return a copy of the array collapsed into one dimension.

Parameters:	
order : {‘C’, ‘F’, ‘A’, ‘K’}, optional
‘C’ means to flatten in row-major (C-style) order. ‘F’ means to flatten in column-major (Fortran- style) order. ‘A’ means to flatten in column-major order if a is Fortran contiguous in memory, row-major order otherwise. ‘K’ means to flatten a in the order the elements occur in memory. The default is ‘C’.

Returns:	
y : ndarray
A copy of the input array, flattened to one dimension.


In [0]:
a = np.array([[1,2],[3,4]])

In [76]:
a

array([[1, 2],
       [3, 4]])

In [77]:
a.shape

(2, 2)

In [78]:
a.flatten()

array([1, 2, 3, 4])

In [79]:
a.flatten('F') # Row major flattening

array([1, 3, 2, 4])

In [80]:
a.flatten('C') # Column major flattening

array([1, 2, 3, 4])

In [81]:
np.arange(3)

array([0, 1, 2])

# Making training mini batches

In [0]:
def get_batches(arr,batch_size,seq_length):
  
  batch_size_total = batch_size * seq_length
  # total number of batches we make
  n_batches = len(arr)//batch_size_total
  #Keep only unique character to make full batches
  arr = arr[:n_batches * batch_size_total]
  #Reshape into batch_size rows
  arr = arr.reshape((batch_size, -1))
  #iterate through the array , one sequence at a time 
  for n in range(0, arr.shape[1], seq_length):
    # The features
    x = arr[:,n:n+seq_length]
    # The targets, shifted by one
    y = np.zeros_like(x)
    
    try:
      y[:,:-1], y[:, -1] = x[:,1:],arr[:,n+seq_length]
    except IndexError:
      y[:,:-1], y[:,-1] = x[:,-1:],arr[:,0]
    yield x, y
    

In [0]:
batches = get_batches(encoded, 8, 50)

In [0]:
x,y = next(batches)

In [85]:
# printing out the first 10 items in a sequence
print('x\n',x[:10,:10])

x
 [[62 81 17 35 13 58  7 77 52 34]
 [73 47 36 77 13 81 17 13 77 17]
 [58 36 30 77 47  7 77 17 77 16]
 [73 77 13 81 58 77 12 81 79 58]
 [77 73 17 28 77 81 58  7 77 13]
 [12 37 73 73 79 47 36 77 17 36]
 [77  1 36 36 17 77 81 17 30 77]
 [22 44  6 47 36 73 18  8 15 77]]


In [86]:
print('y\n',y[:10,:10])

y
 [[81 17 35 13 58  7 77 52 34 34]
 [47 36 77 13 81 17 13 77 17 13]
 [36 30 77 47  7 77 17 77 16 47]
 [77 13 81 58 77 12 81 79 58 16]
 [73 17 28 77 81 58  7 77 13 58]
 [37 73 73 79 47 36 77 17 36 30]
 [ 1 36 36 17 77 81 17 30 77 73]
 [44  6 47 36 73 18  8 15 77 80]]


# Defining Network with Pytorch

In [87]:
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
  print('Training on GPU')
else:
  print('No GPU available, training on CPU; consider making n_epochs very small')

Training on GPU


In [0]:
class CharRNN(nn.Module):
  
  def __init__(self,tokens, n_hidden=256,n_layers=2,drop_prob = 0.5,lr = 0.001):
    super().__init__()
    self.drop_prob = drop_prob
    self.n_layers = n_layers
    self.n_hidden = n_hidden
    self.lr = lr
    
    # creating character dictionary
    self.chars = tokens
    self.int2char = dict(enumerate(self.chars))
    self.char2int = {ch: ii for ii, ch in self.int2char.items()}
    
    #TODO: define the LSTM
    self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout = drop_prob, batch_first = True)
    
    #TODO: define a dropout layer
    self.dropout = nn.Dropout(drop_prob)
    
    #TODO: define the final, fully connected layer 
    self.fc = nn.Linear(n_hidden, len(self.chars))
    
    
  def forward(self, x, hidden):
    ''' 
    Forward pass through the network .
    These inputs are x, and the hidden/cell state hidden
    '''
    
    ## Get the output and the new hidden state from the lstm
    r_output, hidden = self.lstm(x, hidden)
    
    ## Pass through the dropout layer
    out = self.dropout(r_output)
    
    ## Stack up LSTM outputs using view 
    # you may need contiguous to reshape the output 
    out = out.contiguous().view(-1,self.n_hidden)
    
    ## Todo: put x through the fully connected layer 
    out = self.fc(out)
    
    # return the final output and hidden state
    return out, hidden
  
  def init_hidden(self, batch_size):
    ''' Initialize hidden state '''
    # Create a two 
    weight = next(self.parameters()).data
    
    if(train_on_gpu):
      hidden = (weight.new(self.n_layers,batch_size, self.n_hidden).zero_().cuda(),
               weight.new(self.n_layers,batch_size,self.n_hidden).zero_().cuda())
    else:
      hidden = (weight.new(self.n_layers,batch_size, self.n_hidden).zero_(),
                weight.new(self.n_layers,batch_size,self.n_hidden).zero_())
      
      return hidden

In [0]:
def train(net, data, epochs=10, batch_size=10, seq_length= 50, lr=0.001,clip=5,val_frac=0.1,print_every = 10):
  '''
    Training a network:
    Arguments
    ---------
    net: CharRNN network
    data: text data to train the network
    epochs: Number of epochs to train 
    batch_size: Number of mini-sequences per mini-batch, aka batch size
    seq_length: Number of character steps per mini batch
    lr: learning rate
    clip: gradient clipping 
    val_frac: Fraction of data to hold out for validation
    print_every: Number of steps for printing training and validation loss
    
  '''
  net.train()
  
  opt = torch.optim.Adam(net.parameters(), lr=lr)
  criterion = nn.CrossEntropyLoss()
  
  # Create training and validation data
  val_idx = int(len(data) * (1-val_frac))
  data, val_data = data[:val_idx], data[val_idx:]
  
  if(train_on_gpu):
    net.cuda()
    
  counter = 0
  n_chars = len(net.chars)
    
  for e in range(epochs):
    #initialize hidden state
    h = net.init_hidden(batch_size)
    
    for x, y in get_batches(data, batch_size, seq_length):
      counter += 1
      
      # One-hot encode our data and make them Torch tensors
      x = one_hot_encode(x, n_chars)
      inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
      
      if(train_on_gpu):
        inputs, targets = inputs.cuda(), targets.cuda()
        
      # Creating new variables for the hidden state, otherwise
      # we'd backpropagate entire history
      
      h = tuple([each.data for each in h])
      
      # zero accumalated gradients
      net.zero_grad()
    
      # get the output from model
      output, h = net(inputs, h)
      
      # calculate the loss and perform backup
      loss = criterion(output, targets.view(batch_size*seq_length).long())
      loss.backward()
      
      # 'clip_grad_norm' helps prevent the exploding gradient problem in RNNs / LSTM
      nn.utils.clip_grad_norm_(net.parameters(),clip)
      opt.step()
      
      # loss stats
      if counter % print_every == 0:
        # get valuation loss
        val_h = net.init_hidden(batch_size)
        val_losses = []
        net.eval()
        
        for x, y in get_batches(val_data, batch_size, seq_length):
          # one hot encode our data and make them Torch tensors
          x = one_hot_encode(x, n_chars)
          x, y = torch.from_numpy(x), torch.from_numpy(y)
          
          '''
          Creating new variables for hidden state, otherwise
          we'd backprop through the entire training history
          '''
          val_h = tuple([each.data for each in val_h])
          
          inputs, targets = x, y
          if(train_on_gpu):
            inputs, targets = inputs.cuda(),targets.cuda()
            
          output, val_h = net(inputs,val_h)
          val_loss = criterion(output, targets.view(batch_size*seq_length).long())
          
          val_losses.append(val_loss.item())
          
      net.train() # reset to train mode after iterating through  validationdata
      
      print("Epoch: {}/{}...".format(e+1,epochs),
           "Step: {}...".format(counter),
           "Loss: {:.4f}".format(loss.item()),
           "Val Loss: {:.4f}".format(np.mean(val_losses)))
      
          
      
      
  

# Instantiating the model

In [90]:
# define and pring the net
n_hidden = 512
n_layers = 2
net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [91]:
batch_size = 128
seq_length = 100
n_epochs = 20 # Start smaller if you are just testing inital behaviour

# train the model
train(net,encoded,epochs = n_epochs, batch_size = batch_size, seq_length = seq_length, lr = 0.001, print_every=10)

Arr Shape (128, 100)
After Filling ones:  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
*arr shape 128 100
n_labels 83
One hot before reshape (12800, 83)
One hot after reshape (128, 100, 83)


TypeError: ignored