In [1]:
import numpy as np
import pandas as pd
# from tensorflow import set_random_seed

from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku

from numpy.random import seed
import tensorflow as tf

# Seed both random number generators this notebook relies on. NumPy alone is
# not enough: Keras weight initialisation and dropout draw from TensorFlow's
# global generator. tf.random.set_seed is the TF2 name of the old
# tf.set_random_seed call that used to be here (commented out).
tf.random.set_seed(2)
seed(1)

import string, os

In [2]:
# Relative path of the plain-text corpus that the next cell opens and reads.
file_path = '001ssb.txt'

In [3]:
# Read the whole corpus. The context manager closes the handle even if the
# read fails, and the explicit encoding keeps the result independent of the
# platform's default locale.
with open(file_path, 'r', encoding='utf-8') as file_:
  data = file_.read()

# Naive sentence split on full stops (abbreviations such as "R. R." are split too).
sentences = data.split('.')

# Keep only the first half (minus one) of the sentences -- presumably to bound
# training cost; TODO confirm the intent.
length = int(len(sentences)/2-1)
sentences = sentences[:length]
length

12626

In [4]:
def clean_text(txt):
  """Normalise one raw sentence before tokenisation.

  Removes every character listed in ``string.punctuation``, lower-cases what
  is left, discards all non-ASCII characters, and finally deletes newline
  characters (they are removed outright, not replaced by a space).

  Args:
    txt: The raw sentence as a ``str``.

  Returns:
    The cleaned sentence as a ``str``.
  """
  strip_punctuation = str.maketrans('', '', string.punctuation)
  lowered = txt.translate(strip_punctuation).lower()
  # Encoding to UTF-8 and decoding as ASCII with 'ignore' drops every
  # character outside the ASCII range.
  ascii_only = lowered.encode("utf-8").decode("ascii", 'ignore')
  return ascii_only.replace("\n", "")

# Clean every sentence, then preview the first ten entries and the total count.
corpus = list(map(clean_text, sentences))
print(corpus[:10])
print(len(corpus))

['a game of thrones book one of a song of ice and fire by george r', ' r', ' martin prologue we should start back gared urged as the woods began to grow dark around them', ' the wildlings are dead', ' do the dead frighten you ser waymar royce asked with just the hint of a smile', ' gared did not rise to the bait', ' he was an old man past fifty and he had seen the lordlings come and go', ' dead is dead he said', ' we have no business with the dead', ' are they dead royce asked softly']
12626


In [5]:
# Module-level tokenizer: fitted inside get_sequence_of_tokens and read again
# by generate_text, so both must see this same instance.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
  """Fit the module-level ``tokenizer`` and build n-gram prefix sequences.

  Each line of ``corpus`` is converted to its list of word indices; every
  prefix of that list containing at least two tokens becomes one training
  sequence.

  Args:
    corpus: Iterable of cleaned text lines.

  Returns:
    A tuple ``(input_sequences, total_words)`` where ``input_sequences`` is a
    list of integer lists and ``total_words`` is the size of
    ``tokenizer.word_index`` plus one.
  """
  tokenizer.fit_on_texts(corpus)
  vocab_size = len(tokenizer.word_index) + 1

  ngram_prefixes = []
  for sentence in corpus:
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    # Prefixes of length 2 .. len(encoded); one-token prefixes are skipped.
    ngram_prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
  return ngram_prefixes, vocab_size

# Fit the tokenizer on the cleaned corpus and collect the n-gram prefix
# sequences together with the vocabulary size (word count + 1).
inp_sequences, total_words = get_sequence_of_tokens(corpus)

In [6]:
# Preview the first 100 n-gram prefix sequences (lists of word indices).
inp_sequences[:100]

[[3, 990],
 [3, 990, 5],
 [3, 990, 5, 1574],
 [3, 990, 5, 1574, 914],
 [3, 990, 5, 1574, 914, 49],
 [3, 990, 5, 1574, 914, 49, 5],
 [3, 990, 5, 1574, 914, 49, 5, 3],
 [3, 990, 5, 1574, 914, 49, 5, 3, 1214],
 [3, 990, 5, 1574, 914, 49, 5, 3, 1214, 5],
 [3, 990, 5, 1574, 914, 49, 5, 3, 1214, 5, 456],
 [3, 990, 5, 1574, 914, 49, 5, 3, 1214, 5, 456, 2],
 [3, 990, 5, 1574, 914, 49, 5, 3, 1214, 5, 456, 2, 299],
 [3, 990, 5, 1574, 914, 49, 5, 3, 1214, 5, 456, 2, 299, 64],
 [3, 990, 5, 1574, 914, 49, 5, 3, 1214, 5, 456, 2, 299, 64, 2094],
 [3, 990, 5, 1574, 914, 49, 5, 3, 1214, 5, 456, 2, 299, 64, 2094, 885],
 [1575, 5371],
 [1575, 5371, 75],
 [1575, 5371, 75, 190],
 [1575, 5371, 75, 190, 945],
 [1575, 5371, 75, 190, 945, 60],
 [1575, 5371, 75, 190, 945, 60, 601],
 [1575, 5371, 75, 190, 945, 60, 601, 991],
 [1575, 5371, 75, 190, 945, 60, 601, 991, 17],
 [1575, 5371, 75, 190, 945, 60, 601, 991, 17, 1],
 [1575, 5371, 75, 190, 945, 60, 601, 991, 17, 1, 728],
 [1575, 5371, 75, 190, 945, 60, 601, 9

In [7]:
def generate_padded_sequences(input_sequences):
  """Left-pad the sequences to a common length and split inputs from targets.

  Args:
    input_sequences: List of integer token sequences of varying length.

  Returns:
    A tuple ``(predictors, label, max_sequence_len)``:
    ``predictors`` holds every column but the last of the padded array,
    ``label`` is the last column one-hot encoded over the module-level
    ``total_words`` classes, and ``max_sequence_len`` is the length of the
    longest input sequence.
  """
  max_sequence_len = max(len(seq) for seq in input_sequences)
  padded = np.array(
      pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
  predictors = padded[:, :-1]
  # The final token of every row is the word to predict.
  label = ku.to_categorical(padded[:, -1], num_classes=total_words)
  return predictors, label, max_sequence_len

# Pad the n-gram sequences and split them into model inputs and one-hot targets.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [8]:
# Length of the longest n-gram sequence; every training row was padded to this length.
max_sequence_len

106

In [9]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
  """Build and compile the next-word prediction network.

  The architecture is Embedding -> LSTM -> Dropout -> Dense(softmax). The
  hyperparameters default to the values that were previously hard-coded, so
  existing two-argument calls build the same model as before.

  Args:
    max_sequence_len: Length of the padded sequences including the target
      token; the network input length is one less than this.
    total_words: Vocabulary size, used for both the embedding input dimension
      and the width of the softmax output.
    embedding_dim: Size of each word embedding vector.
    lstm_units: Number of units in the LSTM layer.
    dropout_rate: Fraction of LSTM outputs dropped during training.

  Returns:
    A compiled ``Sequential`` model using categorical cross-entropy loss and
    the Adam optimizer.
  """
  # The last token of every padded row is the label, so inputs are one shorter.
  input_len = max_sequence_len - 1
  model = Sequential()

  model.add(Embedding(total_words, embedding_dim, input_length=input_len))

  model.add(LSTM(lstm_units))
  model.add(Dropout(dropout_rate))

  model.add(Dense(total_words, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam')
  return model

# Build the network from the padded sequence length and vocabulary size,
# then print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 105, 10)           92210     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 9221)              931321    
                                                                 
Total params: 1067931 (4.07 MB)
Trainable params: 1067931 (4.07 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Train for a fixed 100 epochs on all sequences; no validation data or
# callbacks are passed, and the returned History object is only displayed,
# not stored.
model.fit(predictors, label, epochs = 100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7bd3acaabc70>

In [11]:
# Display the trained model object's repr.
model

<keras.src.engine.sequential.Sequential at 0x7bd3b19cf370>

In [12]:
def generate_text(seed_text, next_words, model, max_sequence_len):
  """Extend ``seed_text`` with up to ``next_words`` greedily predicted words.

  At each step the current text is tokenised with the module-level
  ``tokenizer``, left-padded to the model's input length, and the index with
  the highest predicted probability is appended as the next word.

  Args:
    seed_text: Starting text to continue.
    next_words: Maximum number of words to append.
    model: Trained model whose ``predict`` returns one probability row per
      input row.
    max_sequence_len: Padded sequence length used for training (including the
      target token); inputs are padded to ``max_sequence_len - 1``.

  Returns:
    The seed text followed by the generated words, title-cased.
  """
  # Invert the vocabulary once instead of scanning word_index on every step.
  index_to_word = {index: word for word, index in tokenizer.word_index.items()}

  for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    probabilities = model.predict(token_list, verbose=0)
    # Reduce the one-element argmax array to a plain int for the lookup.
    predicted = int(np.argmax(probabilities, axis=1)[0])

    output_word = index_to_word.get(predicted)
    if output_word is None:
      # The predicted index has no vocabulary entry (e.g. the padding index).
      # Appending nothing would leave the input unchanged and repeat the same
      # prediction forever, so stop generating here.
      break
    seed_text += " " + output_word
  return seed_text.title()

In [13]:
# Generate up to 500 words continuing the seed "Kings landing" and print the
# title-cased result. The commented-out line is an alternative seed.
# print(generate_text("Tyrion", 200, model, max_sequence_len))
print(generate_text("Kings landing", 500, model, max_sequence_len))

Kings Landing Was The Broken Tower Of The Mountain And The Great Brothers Had A Cavernous Blue Of Least Its Bronze Scales Of Black Age And A Heirs Of Twelve Bread And Menatarms And Blood And Rotting Black Leaves Black Iron Decent Body Armored With Iron Iron Wives And Hundreds Of Wagons
