In [28]:
import string
import re
import numpy as np
from pickle import dump, load
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Input
from random import randint
from tensorflow.keras.preprocessing.sequence import pad_sequences

**Data Preparation**

In [17]:
def load_doc(filename):
    file = open(filename, 'r', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [18]:
def clean_doc(doc):
    doc = doc.replace('--', ' ')
    tokens = doc.split()
    re_punc = re.compile(f"[{re.escape(string.punctuation)}]")
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w.lower() for w in tokens]
    return tokens

In [19]:
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [20]:
in_filename = '/kaggle/input/the-plato/republic_clean.txt'
doc = load_doc(in_filename)
tokens = clean_doc(doc)
print(f"Total tokens: {len(tokens)}")
print(f"Unique tokens: {len(set(tokens))}")

Total tokens: 117342
Unique tokens: 7323


In [21]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length : i]
    line = ' '.join(seq)
    sequences.append(line)
print(f"Total sequences: {len(sequences)}")

out_filename = 'republic_sequences.txt'
save_doc(sequences, out_filename)

Total sequences: 117291


**Train Language Model**

In [22]:
def define_model(vocab_size, seq_length):
    model = Sequential()
    model.add(Input(shape = (seq_length,)))
    model.add(Embedding(vocab_size, 50))
    model.add(LSTM(100, return_sequences = True))
    model.add(LSTM(100))
    model.add(Dense(100, activation = 'relu'))
    model.add(Dense(vocab_size, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    model.summary()
    return model

In [23]:
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [24]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [25]:
vocab_size = len(tokenizer.word_index) + 1
sequences = np.array(sequences)
# separate into input and output
x, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes = vocab_size)
seq_length = x.shape[1]

In [26]:
model = define_model(vocab_size, seq_length)
model.fit(x, y, batch_size = 128, epochs = 100)
model.save('model.h5') # save model to file
dump(tokenizer, open('tokenizer.pkl', 'wb')) # save the tokenizer

Epoch 1/100
[1m917/917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - accuracy: 0.0614 - loss: 6.4439
Epoch 2/100
[1m917/917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.1028 - loss: 5.7122
Epoch 3/100
[1m917/917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.1271 - loss: 5.4772
Epoch 4/100
[1m917/917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.1456 - loss: 5.3005
Epoch 5/100
[1m917/917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.1548 - loss: 5.1762
Epoch 6/100
[1m917/917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.1607 - loss: 5.0818
Epoch 7/100
[1m917/917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.1649 - loss: 4.9802
Epoch 8/100
[1m917/917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.1684 - loss: 4.9183
Epoch 9/100
[1m

In [34]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    for _ in range (n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen = seq_length, truncating = 'pre')
        yhat = model.predict(encoded, verbose = 0)
        yhat = np.argmax(yhat, axis = -1)
        out_word = ''
        for word, index in  tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [35]:
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
# input : line.size() - output_size
seq_length = len(lines[0].split()) - 1

In [32]:
model = load_model('model.h5')
tokenizer = load(open('tokenizer.pkl', 'rb'))

In [40]:
seed_text = lines[randint(0, len(lines))] # select a random line
print(seed_text + '\n')

generated = generate_seq(model, tokenizer, seq_length, seed_text, 75)
print(generated)

philosophy instead of persisting degenerates and receives another character but if philosophy ever finds in the state that perfection which she herself is then will be seen that she is in truth divine and that all other things whether natures of men or institutions are but now i know that you

are right and how can not the same thing which is a lover of accuracy is a mixture of the other and exclusive among the state and the other of the state and the other of the unreasoning anger of his depth is the permission to drink for soon as they are to be reserved for the interests of the state and the other of the state consisted in the argument to prefer to be
