## Imports

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import re
import pickle
import os
from collections import Counter

# Load Data

In [2]:
# Read the whole corpus file into `text` as one UTF-8 decoded string.
with open('./data/valid.txt', 'r', encoding='utf8') as f:
    text = f.read()

# Pre-processing

In [3]:
def create_lookup_tables(text):
    """
    Build the two vocabulary mappings used to encode and decode tokens.

    Ids are handed out by descending frequency, so the most frequent token
    receives id 0.

    :param text: Iterable of tokens (the notebook passes a list of words)
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    frequencies = Counter(text)

    # Rank tokens from most to least frequent.
    ranked_tokens = sorted(frequencies, key=lambda token: frequencies[token], reverse=True)

    vocab_to_int = {}
    int_to_vocab = {}
    for index, token in enumerate(ranked_tokens):
        int_to_vocab[index] = token
        vocab_to_int[token] = index

    return vocab_to_int, int_to_vocab


def token_lookup():
    """
    Build the punctuation-to-placeholder mapping used during preprocessing.

    :return: Dict whose keys are punctuation symbols (including the newline
             character) and whose values are the placeholder tokens
    """
    return {
        '.': '<PERIOD>',
        ',': '<COMMA>',
        '"': '<QUOTATION_MARK>',
        ';': '<SEMICOLON>',
        '!': '<EXCLAMATION_MARK>',
        '?': '<QUESTION_MARK>',
        '(': '<LEFT_PAREN>',
        ')': '<RIGHT_PAREN>',
        '-': '<DASH>',
        '\n': '<NEW_LINE>',
    }

In [4]:
# Extra vocabulary entries that never occur in the corpus itself.
SPECIAL_WORDS = {'PADDING': '<PAD>'}
def preprocess_and_save_data(dataset_path):
    """
    Tokenize a text file, encode it as integer ids and pickle the result.

    The file is read as UTF-8, every punctuation symbol from token_lookup()
    is replaced by its space-padded placeholder token, the text is
    lower-cased and split on whitespace. The vocabulary is built from those
    words plus SPECIAL_WORDS, and the tuple
    (int_text, vocab_to_int, int_to_vocab, token_dict) is written to
    'preprocess.p' in the current working directory.

    :param dataset_path: Path of the text file to preprocess
    """
    with open(dataset_path, 'r', encoding='utf8') as f:
        text = f.read()

    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    # Lower-casing happens after substitution, so placeholders are stored
    # in lower case (e.g. '<period>') in the vocabulary.
    words = text.lower().split()

    vocab_to_int, int_to_vocab = create_lookup_tables(words + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in words]

    # Context manager guarantees the pickle is flushed and the handle closed.
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), out_file)

## Save processed data

In [5]:
# Encode the corpus and write the result to 'preprocess.p' in the working directory.
preprocess_and_save_data('./data/valid.txt')

## Load preprocessed data

In [6]:
def load_preprocess():
    """
    Load the preprocessed data pickled in 'preprocess.p'.

    :return: Whatever object was pickled into the file; as written by
             preprocess_and_save_data this is the tuple
             (int_text, vocab_to_int, int_to_vocab, token_dict)
    """
    # NOTE: pickle can execute arbitrary code while loading; only open
    # files produced by this notebook.
    with open('preprocess.p', mode='rb') as in_file:
        return pickle.load(in_file)

In [7]:
# Unpack the encoded corpus, both vocabulary mappings and the punctuation table.
int_text, vocab_to_int, int_to_vocab, token_dict = load_preprocess()

# Batching

In [8]:
# True when a CUDA device is available; read as a global by the model and training helpers below.
train_on_gpu = torch.cuda.is_available()

In [9]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size):
    """
    Turn a flat list of word ids into a shuffled DataLoader of
    (window, next word) pairs.

    :param words: The word ids
    :param sequence_length: Number of consecutive word ids in each input window
    :param batch_size: The number of (window, target) pairs per batch
    :return: DataLoader over a TensorDataset of inputs and targets
    """
    # Drop trailing words that do not fill a whole multiple of batch_size.
    usable_count = (len(words) // batch_size) * batch_size
    words = words[:usable_count]

    # One sample per start position that still has a following target word.
    window_count = len(words) - sequence_length
    features = [words[start:start + sequence_length] for start in range(window_count)]
    targets = [words[start + sequence_length] for start in range(window_count)]

    dataset = TensorDataset(torch.tensor(features), torch.tensor(targets))
    return DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Building LSTM

In [10]:
class RNN(nn.Module):
    """
    Embedding -> multi-layer LSTM -> linear model that scores the next word.

    Given a batch of word-id windows it returns one score vector per window,
    computed from the LSTM output at the final time step.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5, lr=0.001):
        """
        :param vocab_size: Number of rows in the embedding table
        :param output_size: Number of output scores per sample
        :param embedding_dim: Size of each embedding vector
        :param hidden_dim: Size of the LSTM hidden state
        :param n_layers: Number of stacked LSTM layers
        :param dropout: Dropout passed to nn.LSTM
        :param lr: Unused; kept so existing calls keep working
        """
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Score the next word for every sequence in the batch.

        :param nn_input: Tensor of word ids, indexed (batch, sequence) since
                         the LSTM is built with batch_first=True
        :param hidden: Tuple (h, c) of LSTM states, e.g. from init_hidden
        :return: Tuple (scores, hidden) where scores has shape
                 (batch, output_size) and hidden is the updated LSTM state
        """
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step is returned, so project just that step
        # instead of running the linear layer over the whole sequence.
        out = self.fc(lstm_out[:, -1])

        return out, hidden

    def init_hidden(self, batch_size):
        """
        Create zeroed LSTM state for a batch.

        The tensors are allocated with the dtype and device of the model's
        own parameters, so they always match wherever the model lives.

        :param batch_size: Number of sequences in the batch
        :return: Tuple (h, c), each of shape (n_layers, batch_size, hidden_dim)
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Run one optimisation step: forward pass, loss, backward pass, update.

    :param rnn: The model; called as rnn(inputs, hidden)
    :param optimizer: Optimizer whose step() applies the update
    :param criterion: Loss function called as criterion(output, target)
    :param inp: Batch of inputs
    :param target: Batch of targets
    :param hidden: Tuple of hidden-state tensors from the previous step
    :return: Tuple (loss value as a Python float, new hidden state)
    """
    # Move model and data to the GPU when one is in use. On CPU the batch is
    # used as-is (previously the input name was only bound in the GPU branch,
    # so CPU runs failed).
    if train_on_gpu:
        rnn.cuda()
        inp, target = inp.cuda(), target.cuda()

    # Detach the hidden state from its history so backprop does not reach
    # into earlier batches.
    h = tuple(each.detach() for each in hidden)

    rnn.zero_grad()

    output, h = rnn(inp, h)
    loss = criterion(output, target)
    loss.backward()

    # Clip the gradient norm at 5 before the update.
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)

    optimizer.step()
    return loss.item(), h

In [11]:
def train_rnn(rnn, batch_size, optimizer, scheduler, criterion, n_epochs, show_every_n_batches=100, data_loader=None):
    """
    Train the model for a number of epochs and return it.

    :param rnn: Model exposing train() and init_hidden(batch_size)
    :param batch_size: Batch size the loader was built with
    :param optimizer: Optimizer forwarded to forward_back_prop
    :param scheduler: Scheduler whose step(value) is called once per epoch
                      with that epoch's mean batch loss
    :param criterion: Loss function forwarded to forward_back_prop
    :param n_epochs: Number of passes over the loader
    :param show_every_n_batches: Print the running average loss every this
                                 many batches
    :param data_loader: Loader to iterate; when None the notebook-level
                        `train_loader` is used
    :return: The trained model (same object as `rnn`)
    """
    loader = train_loader if data_loader is None else data_loader

    # Only completely full batches are used, because the hidden state is
    # sized for exactly batch_size sequences.
    n_batches = len(loader.dataset) // batch_size

    batch_losses = []

    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):

        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)

        epoch_loss_total = 0.0
        epoch_batches = 0

        for batch_i, (inputs, labels) in enumerate(loader, 1):

            if batch_i > n_batches:
                break

            # forward, back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            # record loss
            batch_losses.append(loss)
            epoch_loss_total += loss
            epoch_batches += 1

            # printing loss stats
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

        # Feed the scheduler the epoch mean rather than the last (noisy)
        # batch loss, and skip the step when no full batch was seen.
        if epoch_batches:
            scheduler.step(epoch_loss_total / epoch_batches)

    # returns a trained rnn
    return rnn

In [12]:
# Number of word ids in each input window, and number of windows per batch.
sequence_length =  15
batch_size = 128

# Shuffled DataLoader of (window, next word id) pairs built from the encoded corpus.
train_loader = batch_data(int_text, sequence_length, batch_size)

# Training and model hyperparameters.
num_epochs = 10
learning_rate = 0.001
vocab_size = len(vocab_to_int)
# One output score per vocabulary entry.
output_size = vocab_size
embedding_dim = 300
hidden_dim = 500
n_layers = 2

# Print the running average loss every this many batches.
show_every_n_batches = 128

# Train Model

In [13]:
def save_model(filename, decoder):
    """
    Serialize `decoder` with torch.save into the current working directory.

    Only the base name of `filename` is used: any directory part and any
    extension are dropped and '.pt' is appended.

    :param filename: Name (or path) from which the output file name is derived
    :param decoder: Object to save
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    torch.save(decoder, stem + '.pt')

In [72]:
# Build the model and move it to the GPU (when available) before the
# optimizer is created from its parameters.
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn.cuda()

# Adam optimizer, a plateau-based learning-rate scheduler and cross-entropy loss.
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1)
criterion = nn.CrossEntropyLoss()

trained_rnn = train_rnn(rnn, batch_size, optimizer, scheduler, criterion, num_epochs, show_every_n_batches)

# save_model keeps only the base name, so this writes 'trained_rnn.pt'
# in the working directory rather than under './save/'.
save_model('./save/trained_rnn', trained_rnn)
print('Model Trained and Saved')

Training for 10 epoch(s)...
Epoch:    1/10    Loss: 7.160677425563335

Epoch:    1/10    Loss: 6.58905853331089

Epoch:    1/10    Loss: 6.423418145626783

Epoch:    1/10    Loss: 6.286110181361437

Epoch:    1/10    Loss: 6.133414402604103

Epoch:    1/10    Loss: 6.090179368853569

Epoch:    1/10    Loss: 6.0231430567801

Epoch:    1/10    Loss: 5.9000828713178635

Epoch:    1/10    Loss: 5.892559997737408

Epoch:    1/10    Loss: 5.895261742174625

Epoch:    1/10    Loss: 5.823074474930763

Epoch:    1/10    Loss: 5.747437451034784

Epoch:    1/10    Loss: 5.777952998876572

Epoch:    1/10    Loss: 5.765991888940334

Epoch:    1/10    Loss: 5.7267242930829525

Epoch:    1/10    Loss: 5.714062798768282

Epoch:    1/10    Loss: 5.673074822872877

Epoch:    1/10    Loss: 5.713571436703205

Epoch:    1/10    Loss: 5.670213121920824

Epoch:    1/10    Loss: 5.646732442080975

Epoch:    1/10    Loss: 5.637774582952261

Epoch:    1/10    Loss: 5.61912290379405

Epoch:    1/10    Loss: 5.62

Epoch:    3/10    Loss: 4.749410767108202

Epoch:    3/10    Loss: 4.753519654273987

Epoch:    3/10    Loss: 4.761327393352985

Epoch:    3/10    Loss: 4.788694716989994

Epoch:    3/10    Loss: 4.741136495023966

Epoch:    3/10    Loss: 4.765004076063633

Epoch:    3/10    Loss: 4.791958769783378

Epoch:    3/10    Loss: 4.760566491633654

Epoch:    3/10    Loss: 4.798851903527975

Epoch:    3/10    Loss: 4.783968899399042

Epoch:    3/10    Loss: 4.747497692704201

Epoch:    3/10    Loss: 4.789872378110886

Epoch:    3/10    Loss: 4.82056056894362

Epoch:    3/10    Loss: 4.7778035923838615

Epoch:    3/10    Loss: 4.787312772125006

Epoch:    3/10    Loss: 4.812111232429743

Epoch:    3/10    Loss: 4.770889487117529

Epoch:    3/10    Loss: 4.803953941911459

Epoch:    3/10    Loss: 4.796065153554082

Epoch:    3/10    Loss: 4.812932897359133

Epoch:    3/10    Loss: 4.782657060772181

Epoch:    3/10    Loss: 4.790665969252586

Epoch:    3/10    Loss: 4.827365882694721

Epoch:    3

Epoch:    5/10    Loss: 4.413279984146357

Epoch:    5/10    Loss: 4.508048012852669

Epoch:    5/10    Loss: 4.510725419968367

Epoch:    5/10    Loss: 4.460140805691481

Epoch:    5/10    Loss: 4.498106196522713

Epoch:    5/10    Loss: 4.49301235191524

Epoch:    5/10    Loss: 4.485750297084451

Epoch:    5/10    Loss: 4.488588135689497

Epoch:    5/10    Loss: 4.521628513932228

Epoch:    5/10    Loss: 4.5111196637153625

Epoch:    5/10    Loss: 4.489472884684801

Epoch:    5/10    Loss: 4.483102664351463

Epoch:    5/10    Loss: 4.5230230540037155

Epoch:    5/10    Loss: 4.499276105314493

Epoch:    5/10    Loss: 4.521032303571701

Epoch:    5/10    Loss: 4.528125170618296

Epoch:    5/10    Loss: 4.5122016947716475

Epoch:    5/10    Loss: 4.495102217420936

Epoch:    5/10    Loss: 4.534401625394821

Epoch:    5/10    Loss: 4.565460748970509

Epoch:    5/10    Loss: 4.504198657348752

Epoch:    5/10    Loss: 4.5241385493427515

Epoch:    5/10    Loss: 4.539779694750905

Epoch:  

Epoch:    7/10    Loss: 4.175411881878972

Epoch:    7/10    Loss: 4.203337583690882

Epoch:    7/10    Loss: 4.184699207544327

Epoch:    7/10    Loss: 4.208175951614976

Epoch:    7/10    Loss: 4.182371204718947

Epoch:    7/10    Loss: 4.182803416624665

Epoch:    7/10    Loss: 4.139114225283265

Epoch:    7/10    Loss: 4.175438638776541

Epoch:    7/10    Loss: 4.1294150948524475

Epoch:    7/10    Loss: 4.20180775411427

Epoch:    7/10    Loss: 4.197769373655319

Epoch:    7/10    Loss: 4.205236809328198

Epoch:    7/10    Loss: 4.211476178839803

Epoch:    7/10    Loss: 4.1576688550412655

Epoch:    7/10    Loss: 4.140850808471441

Epoch:    7/10    Loss: 4.186693279072642

Epoch:    7/10    Loss: 4.204785607755184

Epoch:    7/10    Loss: 4.217701710760593

Epoch:    7/10    Loss: 4.203047664836049

Epoch:    7/10    Loss: 4.214039519429207

Epoch:    7/10    Loss: 4.180067304521799

Epoch:    7/10    Loss: 4.178217390552163

Epoch:    7/10    Loss: 4.191901374608278

Epoch:    

Epoch:    9/10    Loss: 4.086771881207824

Epoch:    9/10    Loss: 4.078895820304751

Epoch:    9/10    Loss: 4.107445999979973

Epoch:    9/10    Loss: 4.086273076012731

Epoch:    9/10    Loss: 4.119758106768131

Epoch:    9/10    Loss: 4.116138853132725

Epoch:    9/10    Loss: 4.113968765363097

Epoch:    9/10    Loss: 4.105491017922759

Epoch:    9/10    Loss: 4.1793663538992405

Epoch:    9/10    Loss: 4.1355050932615995

Epoch:   10/10    Loss: 4.077266871929169

Epoch:   10/10    Loss: 4.058593090623617

Epoch:   10/10    Loss: 4.078632375225425

Epoch:   10/10    Loss: 4.066049695014954

Epoch:   10/10    Loss: 4.046324973925948

Epoch:   10/10    Loss: 4.0410935543477535

Epoch:   10/10    Loss: 4.070519428700209

Epoch:   10/10    Loss: 4.080893551930785

Epoch:   10/10    Loss: 4.067364927381277

Epoch:   10/10    Loss: 4.074931710958481

Epoch:   10/10    Loss: 4.0625165943056345

Epoch:   10/10    Loss: 4.105835657566786

Epoch:   10/10    Loss: 4.058726489543915

Epoch: 

# Load Trained Model

In [14]:
def load_model(filename):
    """
    Load an object previously written by save_model.

    The file name is derived the same way as in save_model: the base name of
    `filename` without directory or extension, plus '.pt', looked up in the
    current working directory.

    :param filename: Name (or path) from which the file name is derived
    :return: The object stored in the file
    """
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'

    # Without CUDA, remap stored GPU tensors to the CPU so the file still loads.
    map_location = None if torch.cuda.is_available() else 'cpu'

    # NOTE: this unpickles a whole Python object, which can execute arbitrary
    # code; only load files produced by this notebook.
    try:
        # Newer PyTorch releases default to weights_only=True, which refuses
        # pickled modules such as the one save_model writes.
        return torch.load(save_filename, map_location=map_location, weights_only=False)
    except TypeError:
        # Older PyTorch releases do not know the weights_only argument.
        return torch.load(save_filename, map_location=map_location)

In [15]:
import torch

# Reload the vocabulary mappings and punctuation table (the encoded text is discarded)
# together with the trained model, so generation can start from this cell.
_, vocab_to_int, int_to_vocab, token_dict = load_preprocess()
trained_rnn = load_model('./save/trained_rnn')

# Generate Text

In [16]:
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100, top_k=5):
    """
    Generate text with the model, starting from a single prime word.

    The input window has the notebook-level `sequence_length`; it starts as
    padding with the prime word in the last slot and is shifted left by one
    position after every generated word.

    :param rnn: Model exposing eval(), parameters(), init_hidden(batch_size)
                and rnn(inputs, hidden) -> (scores, hidden)
    :param prime_id: Word id to start from
    :param int_to_vocab: Dict mapping word ids to words
    :param token_dict: Dict mapping punctuation symbols to placeholder tokens
    :param pad_value: Word id used to fill the initial window
    :param predict_len: Number of words to generate after the prime word
    :param top_k: Number of highest-scoring candidates to sample from;
                  capped at the number of scores the model returns
    :return: The generated text with placeholder tokens turned back into
             punctuation
    """
    rnn.eval()

    # Run on whatever device the model's weights live on.
    device = next(rnn.parameters()).device

    # create a sequence (batch_size=1) with the prime_id
    current_seq = np.full((1, sequence_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _step in range(predict_len):
            seq_tensor = torch.tensor(current_seq, dtype=torch.long, device=device)

            # fresh hidden state each step; the whole window is re-fed
            hidden = rnn.init_hidden(seq_tensor.size(0))

            # get the output of the rnn
            output, _hidden = rnn(seq_tensor, hidden)

            # get the next word probabilities
            probs = F.softmax(output, dim=1).cpu()

            # use top_k sampling to get the index of the next word; never ask
            # for more candidates than there are scores
            k = min(top_k, probs.size(1))
            top_p, top_i = probs.topk(k)
            top_p = top_p[0].numpy()
            top_i = top_i[0].numpy()

            # select the likely next word index with some element of randomness
            word_i = int(np.random.choice(top_i, p=top_p / top_p.sum()))

            # retrieve that word from the dictionary
            predicted.append(int_to_vocab[word_i])

            # the generated word becomes the last entry of the next window
            current_seq = np.roll(current_seq, -1, 1)
            current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)

    # Replace punctuation tokens (stored lower-cased in the vocabulary)
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')

    # return all the sentences
    return gen_sentences

In [22]:
# Number of words to generate and the word the text starts from.
gen_length = 100 
prime_word = "the" 
# Window length read as a global by generate(); same value as used for training above.
sequence_length =  15
train_on_gpu = torch.cuda.is_available()

# The initial window is filled with the padding word's id.
pad_word = '<PAD>'
generated_script = generate(trained_rnn, vocab_to_int[prime_word], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

the fear and courage of all morality.
the greatest discovery of life is that the future is the best remedy for it.
the world belongs to the energetic and the brunt of the mind.
the most difficult deception is to be a musician and a person of action.
i think it's a great deal that the world is the person with your own life.
the greatest discovery of the human mind is that it has been a limitation to the top of our lives.
the most important thing about
