Training on the text of Anna Karenina

Idea:
Pass one character at a time into an RNN.
Each character is one-hot encoded and then fed into a hidden layer.
The hidden layer has 2 outputs: an output and a hidden state.
The output goes to a final fully connected output layer, which produces class scores. Applying a softmax function to these scores gives the probabilities of the possible next characters, from which the most likely next character can be picked.


At the end, the network is going to be able to generate new text, one character at a time.



In [200]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F


In [201]:
# Read the whole novel into memory as a single string.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open('deep-learning-v2-pytorch/recurrent-neural-networks/char-rnn/data/anna.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [202]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [203]:
# Turn the text into numerical tokens, because the network
# can only learn from numerical data.

# Build the vocabulary: one entry per distinct character in the text.
# Sorting keeps the character <-> integer mapping identical on every run;
# iterating over a bare set of strings can give a different order in each
# interpreter session.
chars = tuple(sorted(set(text)))

# int2char: integer index -> character
int2char = dict(enumerate(chars))

# char2int: character -> integer index (the inverse of int2char)
char2int = {char: index for index, char in int2char.items()}

In [204]:
# Encoding step: look up the integer token of every character.
# Only the first 15 characters of the text are encoded here.
encoded = np.array([char2int[symbol] for symbol in text[:15]])

In [205]:
def one_hot_encoder(encoded_arr, n_labels):
    '''Turn an array of integer labels into one-hot vectors.

    Arguments
    ---------
    encoded_arr: integer array (or array-like) of any shape; every entry
        is expected to be a label in the range [0, n_labels)
    n_labels: length of each one-hot vector

    Returns
    -------
    A float array of shape (*encoded_arr.shape, n_labels) holding a 1 at
    the label position of every entry and 0 everywhere else.'''
    encoded_arr = np.asarray(encoded_arr)

    # Work on a flat view of the labels so that inputs of any
    # dimensionality are handled the same way.
    flat_labels = encoded_arr.flatten()

    # One row per label (counted over ALL dimensions of the input, not
    # just the second one) and one column per possible label value.
    initial_arr = np.zeros([flat_labels.size, n_labels])

    # array[row to add 1, column to add 1]:
    # row i gets its 1 in the column given by the i-th label
    initial_arr[np.arange(flat_labels.size), flat_labels] = 1

    # Restore the input's shape, with the one-hot axis appended at the end
    one_hot = initial_arr.reshape((*encoded_arr.shape, n_labels))

    return one_hot

In [206]:
# Toy data for trying out the batching steps by hand:
# the integers 1 through 19 plus a small batch size and sequence length.
batch_size, seq_length = 2, 3
arr = np.arange(1, 20)

In [207]:
# number of array elements that go into one mini batch
batch_size_total = seq_length * batch_size

In [208]:
# how many complete mini batches fit into arr
n_batches = len(arr) // batch_size_total

In [209]:
# drop the trailing elements that do not fill a complete mini batch
arr = arr[: n_batches * batch_size_total]

In [210]:
arr

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18])

In [211]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
    batch_size x seq_length from arr.

    Arguments
    ---------
    arr: Array you want to make batches from
    batch_size: batch size, the number of sequences per batch
    seq_length: num of encoded chars in a sequence

    Yields
    ------
    x, y: two arrays of shape (batch_size, seq_length). x holds the
        features and y the targets, i.e. x shifted one step to the left.
        For the last batch the final target column wraps around to the
        first column of the reshaped array.'''

    # calculating the num of characters in a mini batch
    batch_size_total = batch_size * seq_length

    # num of complete batches that we can make
    n_batches = len(arr) // batch_size_total

    # keep only enough characters to make full batches
    # some data may be lost here but generally it doesn't really matter
    arr = arr[:n_batches * batch_size_total]

    # reshape into batch_size rows
    # the -1 is just a dimension placeholder. It will automatically fill
    # up the second dimension to whatever size it needs to be
    # to accommodate all the data
    arr = arr.reshape((batch_size, -1))
    n_cols = arr.shape[1]

    for n in range(0, n_cols, seq_length):
        # The features
        x = arr[:, n:n + seq_length]

        # the targets: the features shifted by one
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        # The last target of each row is the element right after the
        # window; for the final window there is none, so wrap around to
        # the first column.
        next_col = n + seq_length
        if next_col < n_cols:
            y[:, -1] = arr[:, next_col]
        else:
            y[:, -1] = arr[:, 0]

        yield x, y
                   
    

In [212]:
# Detect whether a CUDA device can be used and report the result.
train_on_gpu = torch.cuda.is_available()
print('Training on GPU' if train_on_gpu else 'No GPU available')

Training on GPU


In [None]:
# Defining the model
class rnn(nn.Module):
    '''Character-level LSTM network that produces scores for the next character.

    Arguments
    ---------
    tokens: sequence of the distinct characters of the vocabulary; its
        length sets the input size and the output size of the network
    n_hidden: number of features in the LSTM hidden state
    n_layers: number of stacked LSTM layers
    drop_prob: dropout probability, used between the LSTM layers and
        on the LSTM output
    lr: learning rate; only stored on the instance, not used by the class'''

    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):

        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # character dictionaries: index -> character and character -> index
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        # define the layers of the model
        # len(self.chars): length of the one hot encoded input character
        # batch_first=True: inputs are laid out as (batch, seq, feature)
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob,
                            batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):
        '''Run one forward pass.

        Arguments
        ---------
        x: input of shape (batch, seq, len(self.chars))
        hidden: tuple (hidden state, cell state) for the LSTM

        Returns
        -------
        out: scores of shape (batch * seq, len(self.chars))
        hidden: the updated (hidden state, cell state) tuple'''

        # an lstm has a hidden and a cell state that are saved as a tuple
        r_out, hidden = self.lstm(x, hidden)

        # dropout on the LSTM output
        out = self.dropout(r_out)

        # stack the outputs of all time steps so that the fully connected
        # layer sees one row per character
        out = out.reshape(-1, self.n_hidden)

        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        '''Create a zeroed (hidden state, cell state) tuple.

        Both tensors have shape (n_layers, batch_size, n_hidden) and are
        created with the same dtype and on the same device as the model
        parameters, so they follow the model wherever it has been moved.'''

        weight = next(self.parameters())

        hidden = (weight.new_zeros(self.n_layers, batch_size, self.n_hidden),
                  weight.new_zeros(self.n_layers, batch_size, self.n_hidden))

        return hidden