# Steps
1. Load the libraries and data
2. Clean the data
3. Tokenize the data
4. Convert to sequence
5. Input sequence and output sequence
6. Create a sequential model
7. LSTM layer
8. Compile the model
9. Fit the model
10. Evaluate the model

Import the libraries

In [None]:
from random import randint
from pickle import load,dump
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding,GRU,Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
import string
import urllib

Load the text from the URL

In [None]:
# `import urllib` alone does not guarantee that the `urllib.request` submodule
# is loaded, so import it explicitly before calling urlopen().
import urllib.request

# Download the cleaned "Republic" text; the context manager closes the HTTP
# response once the body has been read.
with urllib.request.urlopen('https://raw.githubusercontent.com/insaid2018/DeepLearning/master/Data/republic_clean.txt') as response:
    # The payload is bytes; decode it to str as UTF-8.
    doc = response.read().decode('utf8')

In [None]:
doc[:1000]
# The raw repr shows the '\r\r\n' line terminators (carriage returns + newline) and the leading '\ufeff' mark

'\ufeff\rBOOK I.\r\r\n\r\r\nI went down yesterday to the Piraeus with Glaucon the son of Ariston,\r\r\nthat I might offer up my prayers to the goddess (Bendis, the Thracian\r\r\nArtemis.); and also because I wanted to see in what manner they would\r\r\ncelebrate the festival, which was a new thing. I was delighted with the\r\r\nprocession of the inhabitants; but that of the Thracians was equally,\r\r\nif not more, beautiful. When we had finished our prayers and viewed the\r\r\nspectacle, we turned in the direction of the city; and at that instant\r\r\nPolemarchus the son of Cephalus chanced to catch sight of us from a\r\r\ndistance as we were starting on our way home, and told his servant to\r\r\nrun and bid us wait for him. The servant took hold of me by the cloak\r\r\nbehind, and said: Polemarchus desires you to wait.\r\r\n\r\r\nI turned round, and asked him where his master was.\r\r\n\r\r\nThere he is, said the youth, coming after you, if you will only wait.\r\r\n\r\r\nCertainly we 

In [None]:
# print() renders the '\r\r\n' escapes as real line breaks, so the text shows line by line
print(doc[:1000])


﻿BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what manner they would
celebrate the festival, which was a new thing. I was delighted with the
procession of the inhabitants; but that of the Thracians was equally,
if not more, beautiful. When we had finished our prayers and viewed the
spectacle, we turned in the direction of the city; and at that instant
Polemarchus the son of Cephalus chanced to catch sight of us from a
distance as we were starting on our way home, and told his servant to
run and bid us wait for him. The servant took hold of me by the cloak
behind, and said: Polemarchus desires you to wait.

I turned round, and asked him where his master was.

There he is, said the youth, coming after you, if you will only wait.

Certainly we will, said Glaucon; and in a few minutes Polemarchus
appear

Clean the data

In [None]:
# turn the doc into clean tokens
def clean_doc(doc):
  """Return the lower-cased alphabetic words of *doc* as a list.

  Double hyphens ('--') are treated as word separators, punctuation is
  stripped from every whitespace-separated token, and any token that is
  not purely alphabetic after stripping is discarded.
  """
  # translation table that deletes every character in string.punctuation
  strip_punct = str.maketrans('', '', string.punctuation)
  cleaned = []
  # '--' joins words in the text, so turn it into a space before splitting
  for raw in doc.replace('--', ' ').split():
    word = raw.translate(strip_punct)
    # keep only tokens made entirely of letters (drops numbers and empties)
    if word.isalpha():
      cleaned.append(word.lower())
  return cleaned


In [None]:
tokens = clean_doc(doc)

In [None]:
tokens

['book',
 'i',
 'i',
 'went',
 'down',
 'yesterday',
 'to',
 'the',
 'piraeus',
 'with',
 'glaucon',
 'the',
 'son',
 'of',
 'ariston',
 'that',
 'i',
 'might',
 'offer',
 'up',
 'my',
 'prayers',
 'to',
 'the',
 'goddess',
 'bendis',
 'the',
 'thracian',
 'artemis',
 'and',
 'also',
 'because',
 'i',
 'wanted',
 'to',
 'see',
 'in',
 'what',
 'manner',
 'they',
 'would',
 'celebrate',
 'the',
 'festival',
 'which',
 'was',
 'a',
 'new',
 'thing',
 'i',
 'was',
 'delighted',
 'with',
 'the',
 'procession',
 'of',
 'the',
 'inhabitants',
 'but',
 'that',
 'of',
 'the',
 'thracians',
 'was',
 'equally',
 'if',
 'not',
 'more',
 'beautiful',
 'when',
 'we',
 'had',
 'finished',
 'our',
 'prayers',
 'and',
 'viewed',
 'the',
 'spectacle',
 'we',
 'turned',
 'in',
 'the',
 'direction',
 'of',
 'the',
 'city',
 'and',
 'at',
 'that',
 'instant',
 'polemarchus',
 'the',
 'son',
 'of',
 'cephalus',
 'chanced',
 'to',
 'catch',
 'sight',
 'of',
 'us',
 'from',
 'a',
 'distance',
 'as',
 'we',
 

In [None]:
# Preview the first 200 cleaned tokens
print("First 200 tokens:",tokens[:200])
# Total number of tokens in the cleaned corpus
print("Total no of tokens:",len(tokens))
# Vocabulary size = number of distinct tokens
print("Vocab:",len(set(tokens)))

First 200 tokens: ['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'r

Create a list of sequences

In [None]:
# Each training line holds 50 input words followed by 1 word to predict.
length = 50 + 1

# Slide a window of `length` tokens over the corpus, one token at a time,
# and store every window as a single space-separated line.
# NOTE(review): range() stops before len(tokens), so the window that ends on
# the very last token is not emitted.
sequences = [
    ' '.join(tokens[stop - length:stop])
    for stop in range(length, len(tokens))
]

# Report how many training lines were produced
print("Total sequences: %d" % len(sequences))


Total sequences: 118633


In [None]:
sequences

['book i i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was',
 'i i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted',
 'i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted with',
 'went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what man

Save the sequences into a text file

In [None]:
def save_doc(lines,filename):
  """Write *lines* to *filename*, separated by newlines.

  Args:
    lines: iterable of strings; they are joined with '\\n' (no trailing
      newline is added).
    filename: path of the text file to create or overwrite.
  """
  data = '\n'.join(lines)
  # The context manager closes the handle even if the write raises.
  with open(filename,'w') as file:
    file.write(data)

# Write the training lines to disk, one sequence per line
out_filename = 'republic_sequences.txt'
save_doc(sequences,out_filename)

Load the saved sequences back from the file

In [None]:
def load_doc(filename):
  """Return the entire contents of the text file *filename* as one string.

  Args:
    filename: path of the file to read.

  Returns:
    The file's text, unmodified.
  """
  # Open read-only; the context manager closes the handle even if read() raises.
  with open(filename,'r') as file:
    return file.read()

# Read the saved sequences back from disk.
# Note: this rebinds `doc`, replacing the raw downloaded text.
ln_filename = 'republic_sequences.txt'
doc = load_doc(ln_filename)
# One training sequence per list entry
lines = doc.split('\n')

Tokenize and convert the data into sequences

In [None]:
# Build the word -> integer index from the training lines
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
# Encode every line as a list of integer word indices
# (this rebinds `sequences`, which previously held the text lines)
sequences = tokenizer.texts_to_sequences(lines)

In [None]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'of': 3,
 'to': 4,
 'is': 5,
 'in': 6,
 'he': 7,
 'a': 8,
 'that': 9,
 'be': 10,
 'i': 11,
 'not': 12,
 'which': 13,
 'are': 14,
 'you': 15,
 'they': 16,
 'or': 17,
 'will': 18,
 'said': 19,
 'as': 20,
 'we': 21,
 'but': 22,
 'have': 23,
 'them': 24,
 'his': 25,
 'for': 26,
 'by': 27,
 'who': 28,
 'their': 29,
 'what': 30,
 'then': 31,
 'this': 32,
 'one': 33,
 'if': 34,
 'with': 35,
 'there': 36,
 'all': 37,
 'true': 38,
 'at': 39,
 'when': 40,
 'do': 41,
 'other': 42,
 'has': 43,
 'yes': 44,
 'any': 45,
 'him': 46,
 'no': 47,
 'good': 48,
 'would': 49,
 'may': 50,
 'state': 51,
 'from': 52,
 'man': 53,
 'say': 54,
 'our': 55,
 'only': 56,
 'was': 57,
 'an': 58,
 'must': 59,
 'should': 60,
 'so': 61,
 'more': 62,
 'us': 63,
 'can': 64,
 'on': 65,
 'were': 66,
 'very': 67,
 'now': 68,
 'like': 69,
 'such': 70,
 'replied': 71,
 'just': 72,
 'certainly': 73,
 'than': 74,
 'also': 75,
 'these': 76,
 'men': 77,
 'same': 78,
 'another': 79,
 'about': 80,
 'justice': 8

In [None]:
sequences

[[1046,
  11,
  11,
  1045,
  329,
  7409,
  4,
  1,
  2873,
  35,
  213,
  1,
  261,
  3,
  2251,
  9,
  11,
  179,
  817,
  123,
  92,
  2872,
  4,
  1,
  2249,
  7408,
  1,
  7407,
  7406,
  2,
  75,
  120,
  11,
  1266,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7405,
  1,
  1609,
  13,
  57,
  8,
  549,
  151,
  11,
  57],
 [11,
  11,
  1045,
  329,
  7409,
  4,
  1,
  2873,
  35,
  213,
  1,
  261,
  3,
  2251,
  9,
  11,
  179,
  817,
  123,
  92,
  2872,
  4,
  1,
  2249,
  7408,
  1,
  7407,
  7406,
  2,
  75,
  120,
  11,
  1266,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7405,
  1,
  1609,
  13,
  57,
  8,
  549,
  151,
  11,
  57,
  1147],
 [11,
  1045,
  329,
  7409,
  4,
  1,
  2873,
  35,
  213,
  1,
  261,
  3,
  2251,
  9,
  11,
  179,
  817,
  123,
  92,
  2872,
  4,
  1,
  2249,
  7408,
  1,
  7407,
  7406,
  2,
  75,
  120,
  11,
  1266,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7405,
  1,
  1609,
  13,
  57,
  8,
  549,
  151,
  11,
  57,
  1147,
  35],
 [1045,

# convert it into an array

In [None]:
sequences = array(sequences)

In [None]:
len(sequences)

118633

In [None]:
print(sequences[0])

[1046   11   11 1045  329 7409    4    1 2873   35  213    1  261    3
 2251    9   11  179  817  123   92 2872    4    1 2249 7408    1 7407
 7406    2   75  120   11 1266    4  110    6   30  168   16   49 7405
    1 1609   13   57    8  549  151   11   57]


X and Y

In [None]:
# Every column except the last is model input; the last column is the word to predict
X , y = sequences[:,:-1] , sequences[:,-1]


In [None]:
X

array([[1046,   11,   11, ...,  549,  151,   11],
       [  11,   11, 1045, ...,  151,   11,   57],
       [  11, 1045,  329, ...,   11,   57, 1147],
       ...,
       [ 382,  467,    4, ..., 1044,  414,   13],
       [ 467,    4,   33, ...,  414,   13,   21],
       [   4,   33,   79, ...,   13,   21,   23]])

In [None]:
X[1]

array([  11,   11, 1045,  329, 7409,    4,    1, 2873,   35,  213,    1,
        261,    3, 2251,    9,   11,  179,  817,  123,   92, 2872,    4,
          1, 2249, 7408,    1, 7407, 7406,    2,   75,  120,   11, 1266,
          4,  110,    6,   30,  168,   16,   49, 7405,    1, 1609,   13,
         57,    8,  549,  151,   11,   57])

In [None]:
y[1]

1147

In [None]:
print(X.shape)
print(y.shape)

(118633, 50)
(118633,)


In [None]:
# word_index numbering starts at 1, so add one to make the largest index a valid class id
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
# One-hot encode the target word over vocab_size classes
y = to_categorical(y,num_classes=vocab_size)

7410


In [None]:
y.shape

(118633, 7410)

In [None]:
X.shape

(118633, 50)

In [None]:
X.shape[1]

50

Create a model

In [None]:
# Next-word model: embedding -> two stacked LSTMs -> dense -> softmax over the vocabulary.
# NOTE(review): recent Keras releases may warn that `input_length` is deprecated -- confirm.
model = Sequential([
    # Embedding (input) layer: 100-dimensional vector per word index, X.shape[1] words per sample
    Embedding(vocab_size,100,input_length= X.shape[1]),
    # First LSTM returns the full sequence so it can feed the second LSTM
    LSTM(100,return_sequences=True),
    # Second LSTM returns only its final output
    LSTM(100),
    # Fully connected hidden layer
    Dense(100 , activation='relu'),
    # Output layer: one probability per vocabulary entry
    Dense(vocab_size,activation='softmax'),
])




Compile the model

In [None]:
# Categorical cross-entropy matches the one-hot targets; accuracy is reported each epoch
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

Fit the model

In [None]:
# Checkpoint callback: monitors training accuracy and, with save_best_only=True
# and mode='max', keeps a file only when accuracy reaches a new maximum.
# The epoch number and accuracy are substituted into the file name.
# NOTE(review): confirm the installed Keras version accepts the '.hdf5' extension.
filepath="weights-improvement-{epoch:02d}-{accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='accuracy', verbose=1, save_best_only=True, mode='max')
# Callbacks are collected in a list
callbacks_list = [checkpoint]

In [None]:
# Train the model. callbacks_list must be passed here, otherwise the
# ModelCheckpoint configured for this run is never invoked and no checkpoint
# files are written.
hist = model.fit(X,y,batch_size=128,epochs=100,callbacks=callbacks_list)

Epoch 1/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 15ms/step - accuracy: 0.0659 - loss: 6.4426
Epoch 2/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - accuracy: 0.1049 - loss: 5.6872
Epoch 3/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 15ms/step - accuracy: 0.1314 - loss: 5.4426
Epoch 4/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - accuracy: 0.1469 - loss: 5.2653
Epoch 5/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - accuracy: 0.1552 - loss: 5.1510
Epoch 6/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - accuracy: 0.1654 - loss: 5.0281
Epoch 7/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - accuracy: 0.1736 - loss: 4.9282
Epoch 8/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 15ms/step - accuracy: 0.1778 - loss: 4.8337
Epoch 9/100
[1m

Save the trained model

In [None]:
# Persist the trained model to 'model.h5' so the inference section can reload it
model.save('model.h5')



In [None]:
# Pickle the fitted tokenizer so inference can reuse the same word index.
# The context manager flushes and closes the file once the dump completes.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

# Inference Pipeline

In [None]:
# Reload the saved model and tokenizer from disk.
model = load_model('model.h5')
# Unpickling executes arbitrary code: only load a tokenizer.pkl that this
# notebook produced. The context manager closes the file after loading.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)



In [None]:
len(lines[0].split())

51

In [None]:
# Each stored line has one more word than the model input (its last word is the target), hence -1
seq_length = len(lines[0].split())-1

Inference Pipeline
1. Random Text
2. Tokenizer
3. Padding for fixed length
4. Predict
5. Use predicted values in tokenizer to generate the text

In [None]:
import numpy as np

In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words by repeatedly predicting the next word.

    Args:
        model: object whose ``predict(encoded, verbose=0)`` returns scores
            over the vocabulary for the encoded input.
        tokenizer: object exposing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> integer index.
        seq_length: number of trailing word indices fed to the model per step.
        seed_text: text the generation starts from.
        n_words: number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed text is not
        included). A predicted index with no vocabulary entry contributes
        an empty word.
    """
    # Invert the vocabulary once so each prediction is a dict lookup instead
    # of a scan over every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = []
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the running text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # keep only the most recent seq_length indices (older ones are cut
        # from the front); pad_sequences' default padding handles short input
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict scores for each word in the vocabulary
        yhat = model.predict(encoded, verbose=0)
        # compute the best index once per step
        predicted_index = int(np.argmax(yhat))
        out_word = index_to_word.get(predicted_index, '')
        # feed the prediction back in as context for the next step
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)


# load the model
model = load_model('model.h5')

# load the tokenizer (unpickling runs arbitrary code: only load a file this
# notebook produced); the context manager closes the file afterwards
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# select a seed text; randint() includes BOTH endpoints, so the upper bound
# must be the last valid index or the lookup can raise IndexError
seed_text = lines[randint(0,len(lines) - 1)]
print("seed_text:" + '\n')
print(seed_text + '\n')

# generate new text: 50 words continuing the seed
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print("generated_text:" + '\n')
print(generated)



seed_text:

excessive love of these delights they kick and butt at one another with horns and hoofs which are made of iron and they kill one another by reason of their insatiable lust for they fill themselves with that which is not substantial and the part of themselves which they fill is

generated_text:

also unsubstantial and incontinent verily socrates said glaucon i said the students of the state may be expected to be savage in life and after death in order that he has a good deal of chance about them in the first place freedom and divine is not the repayment of


In [None]:
# `generated` is a single string, so this is its length in characters, not words
print(len(generated))

265
