**Text Generation with Neural Networks**

**Functions for Processing Text**

Reading in files as a string text

In [54]:
def read_file(file):
  with open(file) as f:
    str_text = f.read()

    return str_text

**Tokenize and Clean Text**

In [55]:
import spacy

In [56]:
nlp = spacy.load("en",disable=["parser","tagger","ner"])

In [57]:
nlp.max_length = 1198623

In [58]:
def seperate_punc(doc_text):
  return[token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [59]:
d = read_file("moby_dick_four_chapters.txt")

In [60]:
tokens = seperate_punc(d)

In [61]:
len(tokens)

11338

**Create Sequences of Tokens**

In [62]:
#25 words --- network predict #26 word

In [63]:
# organize into sequences of tokens
train_length = 25 + 1 # 25 training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_length,len(tokens)):
  seq = tokens[i-train_length:i]  # Grab train_len# amount of characters

   # Add to list of sequences
  text_sequences.append(seq)
  

In [64]:
" ".join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [65]:
" ".join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

**Keras Tokenization**

In [66]:
from keras.preprocessing.text import Tokenizer

In [67]:
# integer encode sequences of words
tokenizer = Tokenizer()

tokenizer.fit_on_texts(text_sequences)

In [68]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [69]:
sequences[0]

[956,
 14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 315,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2712,
 14,
 24]

In [70]:
tokenizer.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'i',
 6: 'to',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'he',
 11: 'his',
 12: 'was',
 13: 'but',
 14: 'me',
 15: 'with',
 16: 'as',
 17: 'at',
 18: 'this',
 19: 'you',
 20: 'is',
 21: 'all',
 22: 'for',
 23: 'my',
 24: 'on',
 25: 'be',
 26: "'s",
 27: 'not',
 28: 'from',
 29: 'there',
 30: 'one',
 31: 'up',
 32: 'what',
 33: 'him',
 34: 'so',
 35: 'bed',
 36: 'now',
 37: 'about',
 38: 'no',
 39: 'into',
 40: 'by',
 41: 'were',
 42: 'out',
 43: 'or',
 44: 'harpooneer',
 45: 'had',
 46: 'then',
 47: 'have',
 48: 'an',
 49: 'upon',
 50: 'little',
 51: 'some',
 52: 'old',
 53: 'like',
 54: 'if',
 55: 'they',
 56: 'would',
 57: 'do',
 58: 'over',
 59: 'landlord',
 60: 'thought',
 61: 'room',
 62: 'when',
 63: 'could',
 64: "n't",
 65: 'night',
 66: 'here',
 67: 'head',
 68: 'such',
 69: 'which',
 70: 'man',
 71: 'did',
 72: 'sea',
 73: 'time',
 74: 'other',
 75: 'very',
 76: 'go',
 77: 'these',
 78: 'more',
 79: 'though',
 80: 'first',
 81: 'sort',


In [71]:
for i in sequences[0]:
  print(f"{i} : {tokenizer.index_word[i]}")

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
315 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2712 : interest
14 : me
24 : on


In [72]:
#Frequency of words
tokenizer.word_counts

OrderedDict([('call', 27),
             ('me', 2471),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('never', 449),
             ('mind', 164),
             ('how', 321),
             ('long', 374),
             ('precisely', 37),
             ('having', 142),
             ('little', 767),
             ('or', 950),
             ('no', 1003),
             ('money', 120),
             ('in', 5647),
             ('my', 1786),
             ('purse', 71),
             ('and', 9646),
             ('nothing', 281),
             ('particular', 152),
             ('to', 6497),
             ('interest', 24),
             ('on', 1716),
             ('shore', 26),
             ('i', 7150),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 416),
             ('the', 15540),
             ('watery', 26),
  

In [73]:
vocabulary_size = len(tokenizer.word_counts)

In [74]:
vocabulary_size

2717

In [75]:
type(sequences)

list

Convert to Numpy Matrix

In [76]:
import numpy as np

In [77]:
sequences = np.array(sequences)

In [78]:
sequences[0]

array([ 956,   14,  263,   51,  261,  408,   87,  219,  129,  111,  954,
        260,   50,   43,   38,  315,    7,   23,  546,    3,  150,  259,
          6, 2712,   14,   24])

Train / Test Split

In [79]:
from keras.utils import to_categorical

In [80]:
#Get the sequence except last one
x = sequences[:,:-1]

In [81]:
#Get only last index value
y = sequences[:,-1]

In [82]:
#We will be having vocabulary number of unique values, so we can create that many number of categories
y = to_categorical(y,num_classes=vocabulary_size+1)

In [83]:
x.shape

(11312, 25)

In [84]:
seq_len = x.shape[1]

Creating an LSTM based model

In [85]:
from keras.models import  Sequential
from keras.layers import Dense,LSTM,Embedding

In [86]:
def create_model(vocabulary_size,seq_len):

  model = Sequential()

  model.add(Embedding(vocabulary_size,seq_len, input_length = seq_len))
  model.add(LSTM(50,return_sequences=True))
  model.add(LSTM(50))
  model.add(Dense(50,activation="relu"))
  model.add(Dense(vocabulary_size,activation="softmax"))

  model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

  model.summary()

  return model

Training the Model

In [87]:
# define model
model = create_model(vocabulary_size+1,seq_len)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            67950     
_________________________________________________________________
lstm_2 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_3 (Dense)              (None, 2718)              138618    
Total params: 244,518
Trainable params: 244,518
Non-trainable params: 0
_________________________________________________________________


In [88]:
# fit model
model.fit(x,y,batch_size=128,epochs=200,verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fc09bca1198>

**Generating New Text**

In [89]:
from keras.preprocessing.sequence import pad_sequences

In [90]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):

  # Final Output
  output_text = []

  # Intial Seed Sequence
  input_text = seed_text

  # Create num_gen_words
  for i in range(num_gen_words):

    # Take the input text string and encode it to a sequence
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]

    # Pad sequences to our trained rate (25 words in the video)
    pad_encoded = pad_sequences([encoded_text],maxlen=seq_len,truncating="pre")

    # Predict Class Probabilities for each word
    pre_word_ind = model.predict_classes(pad_encoded,verbose=0)[0]

    # Grab word
    pred_word = tokenizer.index_word[pre_word_ind]
    
     # Update the sequence of input text (shifting one over with the new word)
    input_text += " " + pred_word

    output_text.append(pred_word)

   # Make it look like a sentence.
  return " ".join(output_text)

In [91]:
text_sequences

[['call',
  'me',
  'ishmael',
  'some',
  'years',
  'ago',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
  'particular',
  'to',
  'interest',
  'me',
  'on'],
 ['me',
  'ishmael',
  'some',
  'years',
  'ago',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
  'particular',
  'to',
  'interest',
  'me',
  'on',
  'shore'],
 ['ishmael',
  'some',
  'years',
  'ago',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
  'particular',
  'to',
  'interest',
  'me',
  'on',
  'shore',
  'i'],
 ['some',
  'years',
  'ago',
  'never',
  'mind',
  'how',
  'long',
  'precisely',
  'having',
  'little',
  'or',
  'no',
  'money',
  'in',
  'my',
  'purse',
  'and',
  'nothing',
 

In [92]:
seed_text = " ".join(text_sequences[0])

In [93]:
seed_text

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [94]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'shore i thought you may accustomed to understand a whalin and had a good frosty principle in doom was beating a small degree civilized by'