# Text Generation with Neural Networks

## Functions for Processing Text

### Reading in files as a string text

In [1]:
def read_file(filepath):
    
    with open(filepath) as f:
        str_text = f.read()
    
    return str_text

In [2]:
doc =read_file('game_of_thrones.txt')

In [3]:
len(doc)

40198

### Tokenize and Clean Text

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

In [6]:
nlp.max_length = 40199

In [7]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [8]:
d = read_file('game_of_thrones.txt')
tokens = separate_punc(d)



In [9]:
tokens

['a',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'a',
 'game',
 'of',
 'thrones',
 'prologue',
 'we',
 'should',
 'start',
 'back',
 'gared',
 'urged',
 'as',
 'the',
 'woods',
 'began',
 'to',
 'grow',
 'dark',
 'around',
 'them',
 'the',
 'wildlings',
 'are',
 'dead',
 'do',
 'the',
 'dead',
 'frighten',
 'you',
 'ser',
 'waymar',
 'royce',
 'asked',
 'with',
 'just',
 'the',
 'hint',
 'of',
 'a',
 'smile',
 'gared',
 'did',
 'not',
 'rise',
 'to',
 'the',
 'bait',
 'he',
 'was',
 'an',
 'old',
 'man',
 'past',
 'fifty',
 'and',
 'he',
 'had',
 'seen',
 'the',
 'lordlings',
 'come',
 'and',
 'go',
 'dead',
 'is',
 'dead',
 'he',
 'said',
 'we',
 'have',
 'no',
 'business',
 'with',
 'the',
 'dead',
 'are',
 'they',
 'dead',
 'royce',
 'asked',
 'softly',
 'what',
 'proof',
 'have',
 'we',
 'will',
 'saw',
 'them',
 'gared',
 'said',
 'if',
 'he',
 'says',
 'they',
 'are',
 'dead',
 'that',
 "'s",
 'proof',
 'enough',
 'for',
 'me',
 'will',
 'had',
 'known',
 'they',
 'would',
 'drag'

In [10]:
len(tokens)

7097

## Create Sequences of Tokens

In [11]:
# organize into sequences of tokens
train_len = 25+1 # 50 training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    
    # Grab train_len# amount of characters
    seq = tokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

In [12]:
text_sequences[1]

['song',
 'of',
 'ice',
 'and',
 'fire',
 'a',
 'game',
 'of',
 'thrones',
 'prologue',
 'we',
 'should',
 'start',
 'back',
 'gared',
 'urged',
 'as',
 'the',
 'woods',
 'began',
 'to',
 'grow',
 'dark',
 'around',
 'them',
 'the']

In [13]:
' '.join(text_sequences[0])

'a song of ice and fire a game of thrones prologue we should start back gared urged as the woods began to grow dark around them'

In [14]:
' '.join(text_sequences[1])

'song of ice and fire a game of thrones prologue we should start back gared urged as the woods began to grow dark around them the'

In [15]:
' '.join(text_sequences[2])

'of ice and fire a game of thrones prologue we should start back gared urged as the woods began to grow dark around them the wildlings'

In [16]:
len(text_sequences)

7071

# Keras

### Keras Tokenization

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [18]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

So what we've done here is we've essentially replaced

our original text sequences, which are just sequences of text 26-long

to sequences that happen to have numbers in place of those words.

And now the idea is that each of these numbers is an ID for a particular word.

And if you ever wanna figure out the relationship you simply call tokenizer.index word.

In [19]:
sequences[0]

[4,
 1867,
 2,
 213,
 3,
 77,
 4,
 1865,
 2,
 1864,
 1862,
 43,
 465,
 1860,
 107,
 25,
 1859,
 20,
 1,
 348,
 347,
 6,
 1858,
 346,
 124,
 49]

In [20]:
tokenizer.index_word

{1: 'the',
 2: 'of',
 3: 'and',
 4: 'a',
 5: 'his',
 6: 'to',
 7: 'he',
 8: 'it',
 9: 'was',
 10: 'ser',
 11: 'will',
 12: 'in',
 13: "'s",
 14: 'had',
 15: 'their',
 16: 'lord',
 17: 'on',
 18: 'son',
 19: 'king',
 20: 'as',
 21: 'you',
 22: 'with',
 23: 'that',
 24: 'aegon',
 25: 'gared',
 26: 'from',
 27: 'i',
 28: 'royce',
 29: 'for',
 30: 'are',
 31: 'not',
 32: 'house',
 33: 'him',
 34: 'at',
 35: 'waymar',
 36: 'no',
 37: 'they',
 38: 'there',
 39: 'by',
 40: 'black',
 41: 'cold',
 42: 'called',
 43: 'we',
 44: 'is',
 45: 'like',
 46: 'lady',
 47: 'when',
 48: 'wife',
 49: 'them',
 50: 'prince',
 51: 'been',
 52: 'all',
 53: 'said',
 54: 'have',
 55: 'but',
 56: 'up',
 57: 'did',
 58: 'could',
 59: 'then',
 60: 'over',
 61: 'its',
 62: 'sword',
 63: 'daughter',
 64: 'dead',
 65: 'an',
 66: 'old',
 67: 'man',
 68: 'out',
 69: 'years',
 70: 'one',
 71: 'brother',
 72: 'snow',
 73: 'through',
 74: 'words',
 75: 'eldest',
 76: 'targaryen',
 77: 'fire',
 78: 'children',
 79: 'iron',


In [21]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

4 : a
1867 : song
2 : of
213 : ice
3 : and
77 : fire
4 : a
1865 : game
2 : of
1864 : thrones
1862 : prologue
43 : we
465 : should
1860 : start
107 : back
25 : gared
1859 : urged
20 : as
1 : the
348 : woods
347 : began
6 : to
1858 : grow
346 : dark
124 : around
49 : them


In [22]:
tokenizer.word_counts

OrderedDict([('a', 4562),
             ('song', 2),
             ('of', 6448),
             ('ice', 134),
             ('and', 4659),
             ('fire', 318),
             ('game', 8),
             ('thrones', 10),
             ('prologue', 11),
             ('we', 532),
             ('should', 65),
             ('start', 14),
             ('back', 249),
             ('gared', 822),
             ('urged', 17),
             ('as', 980),
             ('the', 13122),
             ('woods', 98),
             ('began', 99),
             ('to', 3636),
             ('grow', 23),
             ('dark', 102),
             ('around', 233),
             ('them', 442),
             ('wildlings', 26),
             ('are', 728),
             ('dead', 338),
             ('do', 156),
             ('frighten', 26),
             ('you', 936),
             ('ser', 1820),
             ('waymar', 624),
             ('royce', 754),
             ('asked', 156),
             ('with', 936),
             ('ju

In [23]:
vocabulary_size = len(tokenizer.word_counts)

In [24]:
vocabulary_size

1868

### Convert to Numpy Matrix

In [25]:
import numpy as np

In [26]:
sequences = np.array(sequences)

In [27]:
sequences

array([[   4, 1867,    2, ...,  346,  124,   49],
       [1867,    2,  213, ...,  124,   49,    1],
       [   2,  213,    3, ...,   49,    1,  767],
       ...,
       [  50,  211, 1857, ..., 1866,    4,  466],
       [ 211, 1857,  267, ...,    4,  466,    2],
       [1857,  267,  211, ...,  466,    2, 1868]])

# Creating an LSTM based model

So for this lecture,

we're going to show you how to create the LSTM based model

but before we do that,

we want to split the data into features and labels.

So we're going to have our X features

which is the first N words of the sequence

and then we'll have our Y label

which is the very next word after that entire sequence.

Once we've done that,

we'll be able to fit the model on that feature data

in order to predict the next word in the sequence.

### Train / Test Split

In [28]:
from keras.utils import to_categorical

In [29]:
sequences

array([[   4, 1867,    2, ...,  346,  124,   49],
       [1867,    2,  213, ...,  124,   49,    1],
       [   2,  213,    3, ...,   49,    1,  767],
       ...,
       [  50,  211, 1857, ..., 1866,    4,  466],
       [ 211, 1857,  267, ...,    4,  466,    2],
       [1857,  267,  211, ...,  466,    2, 1868]])

In [30]:
# First 49 words
sequences[:,:-1]

array([[   4, 1867,    2, ..., 1858,  346,  124],
       [1867,    2,  213, ...,  346,  124,   49],
       [   2,  213,    3, ...,  124,   49,    1],
       ...,
       [  50,  211, 1857, ..., 1861, 1866,    4],
       [ 211, 1857,  267, ..., 1866,    4,  466],
       [1857,  267,  211, ...,    4,  466,    2]])

In [31]:
# last Word
sequences[:,-1]

array([  49,    1,  767, ...,  466,    2, 1868])

In [32]:
X = sequences[:,:-1]

In [33]:
y = sequences[:,-1]

In [34]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [35]:
seq_len = X.shape[1]

In [36]:
seq_len

25

### Training the Model

In [37]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

from keras.models import sequential

and then from keras.layers import

and we'll be importing a dense layer

and LSTM layer to deal with the sequences

and an embedding layer to deal with the vocabulary

In [38]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25))  # Remove input_length argument
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Build the model to calculate output shapes and parameter counts
    model.build((None, seq_len))  # Pass input shape (batch_size, seq_len)
    # Print the summary
    model.summary()
    return model

In [39]:
# define model
model = create_model(vocabulary_size+1, seq_len)

---

----

but if you saw the summary, then you are good to go.

Okay, so coming up next we're going to generate new text.

So the last thing to do here

is actually train and fit our model.

So what we do is we're going to say,

from pickle import dump, load.

That's going to allow us to actually save the file

and then load it up later on.

In [40]:
from pickle import dump,load

In [42]:
# fit model
model.fit(X, y, batch_size=128, epochs=300,verbose=1)

Epoch 1/300
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 195ms/step - accuracy: 0.0715 - loss: 6.0783
Epoch 2/300
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 180ms/step - accuracy: 0.0699 - loss: 6.0701
Epoch 3/300
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 182ms/step - accuracy: 0.0768 - loss: 5.8589
Epoch 4/300
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 182ms/step - accuracy: 0.0804 - loss: 5.7745
Epoch 5/300
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 186ms/step - accuracy: 0.0816 - loss: 5.7443
Epoch 6/300
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 184ms/step - accuracy: 0.0810 - loss: 5.6422
Epoch 7/300
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 185ms/step - accuracy: 0.0848 - loss: 5.5415
Epoch 8/300
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 182ms/step - accuracy: 0.0824 - loss: 5.4558
Epoch 9/300
[1m56/56[0

<keras.src.callbacks.history.History at 0x1d4ea6d28d0>

In [43]:
# save the model to file
model.save('game_of_thrones.txt.keras')
# save the tokenizer
dump(tokenizer, open('game_of_thrones.txt', 'wb'))

# Generating New Text

In [44]:
from random import randint
from pickle import load

In [45]:
from keras.models import load_model

In [46]:
#from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.sequence import pad_sequences

In [47]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    
    # Final Output
    output_text = []
    
    # Initial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate 
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_probabilities = model.predict(pad_encoded, verbose=0)[0]
        
        # Sample the index of the next word based on predicted probabilities
        next_word_index = np.random.choice(len(pred_probabilities), p=pred_probabilities)
        
        # Convert index to word
        pred_word = tokenizer.index_word[next_word_index + 1]  # Adjust index by 1
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)


### Grab a random seed sequence

In [48]:
text_sequences[0]

['a',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'a',
 'game',
 'of',
 'thrones',
 'prologue',
 'we',
 'should',
 'start',
 'back',
 'gared',
 'urged',
 'as',
 'the',
 'woods',
 'began',
 'to',
 'grow',
 'dark',
 'around',
 'them']

In [49]:
import random

In [54]:
random.seed(101)
random_pick_1 = random.randint(0,len(text_sequences))

In [55]:
random.seed(102)
random_pick_2 = random.randint(0, len(text_sequences))

In [56]:
random.seed(103)
random_pick_3 = random.randint(0, len(text_sequences))

In [57]:
random_seed_text_1 = text_sequences[random_pick_1]

In [58]:
random_seed_text_2 = text_sequences[random_pick_2]

In [59]:
random_seed_text_3 = text_sequences[random_pick_3]

In [60]:
random_seed_text_1

['their',
 'son',
 'tyrek',
 'squire',
 'to',
 'the',
 'king',
 'gerion',
 'his',
 'youngest',
 'brother',
 'lost',
 'at',
 'sea',
 'his',
 'bastard',
 'daughter',
 'joy',
 'a',
 'girl',
 'of',
 'ten',
 'their',
 'cousin',
 'ser',
 'stafford']

In [61]:
random_seed_text_2

['will',
 'and',
 'at',
 'first',
 'you',
 'shiver',
 'and',
 'your',
 'teeth',
 'chatter',
 'and',
 'you',
 'stamp',
 'your',
 'feet',
 'and',
 'dream',
 'of',
 'mulled',
 'wine',
 'and',
 'nice',
 'hot',
 'fires',
 'it',
 'burns']

In [62]:
random_seed_text_3

['islands',
 'called',
 'ironmen',
 'by',
 'those',
 'they',
 'plundered',
 'were',
 'the',
 'terrors',
 'of',
 'the',
 'seas',
 'sailing',
 'as',
 'far',
 'as',
 'the',
 'port',
 'of',
 'ibben',
 'and',
 'the',
 'summer',
 'isles',
 'they']

In [63]:
seed_text_1 = ' '.join(random_seed_text_1)

In [64]:
seed_text_2 = ' '.join(random_seed_text_2)

In [65]:
seed_text_3 = ' '.join(random_seed_text_3)

In [66]:
seed_text_1

'their son tyrek squire to the king gerion his youngest brother lost at sea his bastard daughter joy a girl of ten their cousin ser stafford'

In [67]:
seed_text_2

'will and at first you shiver and your teeth chatter and you stamp your feet and dream of mulled wine and nice hot fires it burns'

In [68]:
seed_text_3

'islands called ironmen by those they plundered were the terrors of the seas sailing as far as the port of ibben and the summer isles they'

In [70]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text_1,num_gen_words=25)

'master snow age he of of of had rewarded had longer of siblings of of of of came had royce houses king it double tightness'

In [71]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text_2,num_gen_words=25)

'was every observed a wine fell son lord of of of sigil left royce ser king a no white half dance of of of of'

In [72]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text_3,num_gen_words=25)

'fierceness to gods ended and fade a dead to is to interrupt like and in of of at of son and escaped of of at'