<a href="https://colab.research.google.com/github/ReynaQuita/nmt/blob/master/01_Text_Generation_with_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

___

<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>
___
# Text Generation with Neural Networks

## Functions for Processing Text

### Reading in files as a string text

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
def read_file(filepath):
    
    with open(filepath) as f:
        str_text = f.read()
    
    return str_text

In [3]:
read_file('/content/gdrive/MyDrive/datasets/moby_dick_four_chapters.txt')

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circulation.  Whenever I\nfind myself growing grim about the mouth; whenever it is a damp,\ndrizzly November in my soul; whenever I find myself involuntarily\npausing before coffin warehouses, and bringing up the rear of every\nfuneral I meet; and especially whenever my hypos get such an upper\nhand of me, that it requires a strong moral principle to prevent me\nfrom deliberately stepping into the street, and methodically knocking\npeople\'s hats off--then, I account it high time to get to sea as soon\nas I can.  This is my substitute for pistol and ball.  With a\nphilosophical flourish Cato throws himself upon his sword; I quietly\ntake to the ship.  There is nothing surprising in this.  If t

### Tokenize and Clean Text

In [4]:
import spacy
nlp = spacy.load('en_core_web_sm',disable=['parser', 'tagger','ner'])

nlp.max_length = 1198623

In [5]:
# nlp.max_length

In [6]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [7]:
d = read_file('/content/gdrive/MyDrive/datasets/robert_frost.txt')
tokens = separate_punc(d)

In [8]:
tokens

['two',
 'roads',
 'diverged',
 'in',
 'a',
 'yellow',
 'wood',
 'and',
 'sorry',
 'i',
 'could',
 'not',
 'travel',
 'both',
 'and',
 'be',
 'one',
 'traveler',
 'long',
 'i',
 'stood',
 'and',
 'looked',
 'down',
 'one',
 'as',
 'far',
 'as',
 'i',
 'could',
 'to',
 'where',
 'it',
 'bent',
 'in',
 'the',
 'undergrowth',
 'then',
 'took',
 'the',
 'other',
 'as',
 'just',
 'as',
 'fair',
 'and',
 'having',
 'perhaps',
 'the',
 'better',
 'claim',
 'because',
 'it',
 'was',
 'grassy',
 'and',
 'wanted',
 'wear',
 'though',
 'as',
 'for',
 'that',
 'the',
 'passing',
 'there',
 'had',
 'worn',
 'them',
 'really',
 'about',
 'the',
 'same',
 'and',
 'both',
 'that',
 'morning',
 'equally',
 'lay',
 'in',
 'leaves',
 'no',
 'step',
 'had',
 'trodden',
 'black',
 'oh',
 'i',
 'kept',
 'the',
 'first',
 'for',
 'another',
 'day',
 'yet',
 'knowing',
 'how',
 'way',
 'leads',
 'on',
 'to',
 'way',
 'i',
 'doubted',
 'if',
 'i',
 'should',
 'ever',
 'come',
 'back',
 'i',
 'shall',
 'be',
 '

In [9]:
len(tokens)

11588

In [10]:
# 4431/25

## Create Sequences of Tokens

In [11]:
# organize into sequences of tokens
train_len = 25+1 # 25 training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    
    # Grab train_len# amount of characters
    seq = tokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

In [12]:
' '.join(text_sequences[0])

'two roads diverged in a yellow wood and sorry i could not travel both and be one traveler long i stood and looked down one as'

In [13]:
' '.join(text_sequences[1])

'roads diverged in a yellow wood and sorry i could not travel both and be one traveler long i stood and looked down one as far'

In [14]:
' '.join(text_sequences[2])

'diverged in a yellow wood and sorry i could not travel both and be one traveler long i stood and looked down one as far as'

In [15]:
len(text_sequences)

11562

# Keras

### Keras Tokenization

In [16]:
from keras.preprocessing.text import Tokenizer

In [17]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [18]:
len(sequences)

11562

In [19]:
sequences[0]

[104,
 868,
 867,
 11,
 6,
 865,
 864,
 5,
 863,
 3,
 67,
 34,
 2152,
 164,
 5,
 32,
 27,
 2151,
 195,
 3,
 285,
 5,
 223,
 64,
 27,
 19]

In [20]:
tokenizer.index_word

{1: 'the',
 2: "'",
 3: 'i',
 4: 'to',
 5: 'and',
 6: 'a',
 7: 'of',
 8: 'it',
 9: "'s",
 10: 'you',
 11: 'in',
 12: 'he',
 13: "n't",
 14: 'that',
 15: 'was',
 16: 'for',
 17: 'but',
 18: 'me',
 19: 'as',
 20: 'on',
 21: 'what',
 22: 'with',
 23: 'they',
 24: 'do',
 25: 'we',
 26: 'all',
 27: 'one',
 28: 'is',
 29: 'had',
 30: 'she',
 31: 'have',
 32: 'be',
 33: 'if',
 34: 'not',
 35: 'his',
 36: 'there',
 37: 'out',
 38: 'him',
 39: 'up',
 40: 'see',
 41: 'from',
 42: 'no',
 43: 'her',
 44: 'like',
 45: 'where',
 46: 'them',
 47: 'so',
 48: 'or',
 49: 'at',
 50: 'by',
 51: 'my',
 52: 'when',
 53: 'this',
 54: 'were',
 55: 'know',
 56: 'some',
 57: 'are',
 58: 'say',
 59: 'here',
 60: 'too',
 61: 'said',
 62: 'can',
 63: "'d",
 64: 'down',
 65: 'been',
 66: 'off',
 67: 'could',
 68: 'then',
 69: 'must',
 70: 'tell',
 71: 'old',
 72: 'would',
 73: 'now',
 74: 'let',
 75: 'way',
 76: "'ll",
 77: 'come',
 78: 'door',
 79: 'make',
 80: 'think',
 81: 'will',
 82: 'your',
 83: 'an',
 84: 'j

In [21]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

104 : two
868 : roads
867 : diverged
11 : in
6 : a
865 : yellow
864 : wood
5 : and
863 : sorry
3 : i
67 : could
34 : not
2152 : travel
164 : both
5 : and
32 : be
27 : one
2151 : traveler
195 : long
3 : i
285 : stood
5 : and
223 : looked
64 : down
27 : one
19 : as


In [22]:
tokenizer.word_counts

OrderedDict([('two', 464),
             ('roads', 28),
             ('diverged', 29),
             ('in', 4294),
             ('a', 6297),
             ('yellow', 32),
             ('wood', 33),
             ('and', 7586),
             ('sorry', 35),
             ('i', 8610),
             ('could', 713),
             ('not', 1468),
             ('travel', 13),
             ('both', 274),
             ('be', 1550),
             ('one', 1732),
             ('traveler', 18),
             ('long', 227),
             ('stood', 151),
             ('looked', 205),
             ('down', 752),
             ('as', 2028),
             ('far', 312),
             ('to', 8151),
             ('where', 1144),
             ('it', 5416),
             ('bent', 26),
             ('the', 14222),
             ('undergrowth', 26),
             ('then', 676),
             ('took', 182),
             ('other', 312),
             ('just', 546),
             ('fair', 78),
             ('having', 260),
          

In [23]:
vocabulary_size = len(tokenizer.word_counts)

In [24]:
vocabulary_size

2153

### Convert to Numpy Matrix

In [25]:
import numpy as np

In [26]:
sequences = np.array(sequences)

In [27]:
sequences

array([[ 104,  868,  867, ...,   64,   27,   19],
       [ 868,  867,   11, ...,   27,   19,  141],
       [ 867,   11,    6, ...,   19,  141,   19],
       ...,
       [ 545,   53,  561, ...,    9,  866,    4],
       [  53,  561,  111, ...,  866,    4, 2153],
       [ 561,  111,   28, ...,    4, 2153,    5]])

In [28]:
sequences.shape

(11562, 26)

# Creating an LSTM based model

In [29]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [30]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

### Train / Test Split

In [31]:
from keras.utils import to_categorical

In [32]:
sequences

array([[ 104,  868,  867, ...,   64,   27,   19],
       [ 868,  867,   11, ...,   27,   19,  141],
       [ 867,   11,    6, ...,   19,  141,   19],
       ...,
       [ 545,   53,  561, ...,    9,  866,    4],
       [  53,  561,  111, ...,  866,    4, 2153],
       [ 561,  111,   28, ...,    4, 2153,    5]])

In [33]:
# First 25 words
sequences[:,:-1]

array([[ 104,  868,  867, ...,  223,   64,   27],
       [ 868,  867,   11, ...,   64,   27,   19],
       [ 867,   11,    6, ...,   27,   19,  141],
       ...,
       [ 545,   53,  561, ...,  176,    9,  866],
       [  53,  561,  111, ...,    9,  866,    4],
       [ 561,  111,   28, ...,  866,    4, 2153]])

In [34]:
# last Word
sequences[:,-1]

array([  19,  141,   19, ...,    4, 2153,    5])

In [35]:
X = sequences[:,:-1]

In [36]:
y = sequences[:,-1]

In [37]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [38]:
seq_len = X.shape[1]

In [39]:
seq_len

25

### Training the Model

In [40]:
# define model
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            53850     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 150)           105600    
_________________________________________________________________
lstm_1 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense (Dense)                (None, 150)               22650     
_________________________________________________________________
dense_1 (Dense)              (None, 2154)              325254    
Total params: 687,954
Trainable params: 687,954
Non-trainable params: 0
_________________________________________________________________


---

----

In [41]:
from pickle import dump,load

In [42]:
# fit model
model.fit(X, y, batch_size=128, epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7efb86140690>

In [43]:
# save the model to file
model.save('epochBIG.h5')
# save the tokenizer
dump(tokenizer, open('epochBIG', 'wb'))

# Generating New Text

In [44]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [45]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

### Grab a random seed sequence

In [46]:
text_sequences[0]

['two',
 'roads',
 'diverged',
 'in',
 'a',
 'yellow',
 'wood',
 'and',
 'sorry',
 'i',
 'could',
 'not',
 'travel',
 'both',
 'and',
 'be',
 'one',
 'traveler',
 'long',
 'i',
 'stood',
 'and',
 'looked',
 'down',
 'one',
 'as']

In [47]:
import random
random.seed(101)
random_pick = random.randint(0,len(text_sequences))

In [48]:
random_seed_text = text_sequences[random_pick]

In [49]:
random_seed_text

['bless',
 'you',
 'i',
 "'m",
 'her',
 'mother-',
 'i',
 'ca',
 "n't",
 'talk',
 'to',
 'her',
 'and',
 'lord',
 'if',
 'i',
 'could',
 "'",
 "'",
 'it',
 'will',
 'go',
 'hard',
 'with',
 'john',
 'what']

In [50]:
seed_text = ' '.join(random_seed_text)

In [51]:
seed_text

"bless you i 'm her mother- i ca n't talk to her and lord if i could ' ' it will go hard with john what"

In [52]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)



'the cellar door and the cellar door and the cellar door and the cellar door and the cellar door and the cellar door and the cellar door and the cellar door and the cellar door and the cellar door and the cellar door and the cellar door and the cellar'

# Great Job!