In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [12]:
shakespeare_url = "https://homl.info.shakespeare"
filepath = keras.utils.get_file('shakespeare.txt',shakespeare_url)

In [40]:
with open(filepath) as f:
    shakespeare_text = f.read()

In [42]:
len(shakespeare_text)

1115394

In [43]:
shakespeare_text = shakespeare_text[:50000]

In [8]:
import spacy
nlp = spacy.load('en_core_web_lg', disable=['ner','tagger','parser'])

In [44]:
def separate_punct(d):
    return [token.text.lower() for token in nlp(d) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [45]:
nlp.max_length = 1115394

In [46]:
tokens = separate_punct(shakespeare_text)

In [47]:
tokens

['first',
 'citizen',
 'before',
 'we',
 'proceed',
 'any',
 'further',
 'hear',
 'me',
 'speak',
 'all',
 'speak',
 'speak',
 'first',
 'citizen',
 'you',
 'are',
 'all',
 'resolved',
 'rather',
 'to',
 'die',
 'than',
 'to',
 'famish',
 'all',
 'resolved',
 'resolved',
 'first',
 'citizen',
 'first',
 'you',
 'know',
 'caius',
 'marcius',
 'is',
 'chief',
 'enemy',
 'to',
 'the',
 'people',
 'all',
 'we',
 "know't",
 'we',
 "know't",
 'first',
 'citizen',
 'let',
 'us',
 'kill',
 'him',
 'and',
 'we',
 "'ll",
 'have',
 'corn',
 'at',
 'our',
 'own',
 'price',
 "is't",
 'a',
 'verdict',
 'all',
 'no',
 'more',
 'talking',
 "on't",
 'let',
 'it',
 'be',
 'done',
 'away',
 'away',
 'second',
 'citizen',
 'one',
 'word',
 'good',
 'citizens',
 'first',
 'citizen',
 'we',
 'are',
 'accounted',
 'poor',
 'citizens',
 'the',
 'patricians',
 'good',
 'what',
 'authority',
 'surfeits',
 'on',
 'would',
 'relieve',
 'us',
 'if',
 'they',
 'would',
 'yield',
 'us',
 'but',
 'the',
 'superfluity

In [48]:
# organize into sequences of tokens
train_len = 25+1 # 50 training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    
    # Grab train_len# amount of characters
    seq = tokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

In [49]:
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [50]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [51]:
sequences[0]

[32,
 56,
 149,
 23,
 471,
 171,
 470,
 132,
 35,
 131,
 36,
 131,
 131,
 32,
 56,
 5,
 28,
 36,
 468,
 170,
 3,
 339,
 42,
 3,
 746,
 36]

In [52]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'i': 4,
 'you': 5,
 'of': 6,
 'a': 7,
 'that': 8,
 'in': 9,
 'he': 10,
 'not': 11,
 'marcius': 12,
 "'": 13,
 'for': 14,
 'your': 15,
 'him': 16,
 "'s": 17,
 'it': 18,
 'with': 19,
 'my': 20,
 'is': 21,
 'have': 22,
 'we': 23,
 'they': 24,
 'as': 25,
 'be': 26,
 'his': 27,
 'are': 28,
 'their': 29,
 'our': 30,
 'but': 31,
 'first': 32,
 'menenius': 33,
 'what': 34,
 'me': 35,
 'all': 36,
 'good': 37,
 'shall': 38,
 'this': 39,
 'will': 40,
 'if': 41,
 'than': 42,
 'no': 43,
 'o': 44,
 'cominius': 45,
 'at': 46,
 'us': 47,
 'well': 48,
 'so': 49,
 'them': 50,
 'do': 51,
 'more': 52,
 'would': 53,
 'which': 54,
 'on': 55,
 'citizen': 56,
 "'ll": 57,
 'make': 58,
 'aufidius': 59,
 'come': 60,
 'or': 61,
 'brutus': 62,
 'volumnia': 63,
 'were': 64,
 'where': 65,
 'by': 66,
 'upon': 67,
 'know': 68,
 'lartius': 69,
 'one': 70,
 'like': 71,
 'from': 72,
 'then': 73,
 'rome': 74,
 'let': 75,
 'can': 76,
 'was': 77,
 'when': 78,
 'how': 79,
 'must': 80,
 'sicin

In [53]:
tokenizer.word_counts

OrderedDict([('first', 1393),
             ('citizen', 719),
             ('before', 263),
             ('we', 1745),
             ('proceed', 57),
             ('any', 214),
             ('further', 59),
             ('hear', 294),
             ('me', 1309),
             ('speak', 295),
             ('all', 1303),
             ('you', 5164),
             ('are', 1525),
             ('resolved', 71),
             ('rather', 228),
             ('to', 5344),
             ('die', 100),
             ('than', 1021),
             ('famish', 51),
             ('know', 624),
             ('caius', 286),
             ('marcius', 2314),
             ('is', 1898),
             ('chief', 26),
             ('enemy', 130),
             ('the', 9606),
             ('people', 349),
             ("know't", 52),
             ('let', 572),
             ('us', 884),
             ('kill', 26),
             ('him', 2054),
             ('and', 5460),
             ("'ll", 702),
             ('have', 1768),
  

In [54]:
vocab_size = len(tokenizer.word_counts)

In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

In [56]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

In [57]:
sequences

[[32,
  56,
  149,
  23,
  471,
  171,
  470,
  132,
  35,
  131,
  36,
  131,
  131,
  32,
  56,
  5,
  28,
  36,
  468,
  170,
  3,
  339,
  42,
  3,
  746,
  36],
 [56,
  149,
  23,
  471,
  171,
  470,
  132,
  35,
  131,
  36,
  131,
  131,
  32,
  56,
  5,
  28,
  36,
  468,
  170,
  3,
  339,
  42,
  3,
  746,
  36,
  468],
 [149,
  23,
  471,
  171,
  470,
  132,
  35,
  131,
  36,
  131,
  131,
  32,
  56,
  5,
  28,
  36,
  468,
  170,
  3,
  339,
  42,
  3,
  746,
  36,
  468,
  468],
 [23,
  471,
  171,
  470,
  132,
  35,
  131,
  36,
  131,
  131,
  32,
  56,
  5,
  28,
  36,
  468,
  170,
  3,
  339,
  42,
  3,
  746,
  36,
  468,
  468,
  32],
 [471,
  171,
  470,
  132,
  35,
  131,
  36,
  131,
  131,
  32,
  56,
  5,
  28,
  36,
  468,
  170,
  3,
  339,
  42,
  3,
  746,
  36,
  468,
  468,
  32,
  56],
 [171,
  470,
  132,
  35,
  131,
  36,
  131,
  131,
  32,
  56,
  5,
  28,
  36,
  468,
  170,
  3,
  339,
  42,
  3,
  746,
  36,
  468,
  468,
  32,
  56,
  32],

In [58]:
sequences = np.array(sequences)

In [59]:
sequences[:,:-1]

array([[  32,   56,  149, ...,   42,    3,  746],
       [  56,  149,   23, ...,    3,  746,   36],
       [ 149,   23,  471, ...,  746,   36,  468],
       ...,
       [  54,    1,  170, ..., 2053,   50,   46],
       [   1,  170,   23, ...,   50,   46,   33],
       [ 170,   23,   38, ...,   46,   33,    8]])

In [60]:
sequences[:,-1]

array([ 36, 468, 468, ...,  33,   8,  17])

In [61]:
X = sequences[:,:-1]

In [62]:
Y = sequences[:,-1]

In [63]:
Y = keras.utils.to_categorical(Y, num_classes=vocab_size+1)

In [69]:
Y.shape

(9182, 2054)

In [70]:
Y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [64]:
seq_len = X.shape[1]

In [65]:
seq_len

25

In [71]:
model = create_model(vocab_size+1, seq_len)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            51350     
_________________________________________________________________
lstm_2 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_3 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_2 (Dense)              (None, 150)               22650     
_________________________________________________________________
dense_3 (Dense)              (None, 2054)              310154    
Total params: 670,354
Trainable params: 670,354
Non-trainable params: 0
_________________________________________________________________


In [73]:
model.fit(X, Y, epochs = 50, verbose=1)

Train on 9182 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1b145c19848>

In [76]:
from pickle import load, dump

In [77]:
model.save('Shakespeare.h5')

In [78]:
dump(tokenizer, open('shakespeare_tokenizer','wb'))