##IMPORTS

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re

In [2]:
dataset = open("corpus.txt", "r", encoding = "utf-8-sig").read()

In [3]:
dataset[:100]

'In the Devoid of Sorrow\nThe Air and my Heart full of Luster\nBehind the Mountains\nThere must be a key'

##PREPROCESSING

In [4]:
# preserves the \n character, unlike the regular .split()
tokens = re.findall(r"\S+|\n", dataset)

In [5]:
tokens[:10]

['In', 'the', 'Devoid', 'of', 'Sorrow', '\n', 'The', 'Air', 'and', 'my']

In [6]:
def clean_text(text):
    # punctuation, except: 
    # ' - because of 'cause and fuckin'
    # & - because of and
    # \n
    text = re.sub(r"[^\n&a-zA-Z']", " ", text)

    # lower-case
    text = text.lower()

    # clean archaisms and other special cases
    text = re.sub(r"'cause ", "because", text)
    text = re.sub(r"shalt", "shall", text)
    text = re.sub(r"kaos", "chaos", text)
    text = re.sub(r"'till", "until", text)
    text = re.sub(r"fuckin'", "fucking", text)
    text = re.sub(r"couldst", "could", text)
    text = re.sub(r"sayeth", "says", text)
    text = re.sub(r"calleth", "calls", text)
    text = re.sub(r"&", " and ", text)
    text = re.sub(r"sathan", "satan", text)
    text = re.sub(r"'em", "them", text)

    # possible parts of other words
    text = re.sub(r"(?<![a-zA-Z])thy(?![a-zA-Z])|thine", "your", text)
    text = re.sub(r"thou|thee|(?<![a-zA-Z])ye(?![a-zA-Z])", "you", text)
    text = re.sub(r"(?<![a-zA-Z])o(?![a-zA-Z])", "oh", text)
    text = re.sub(r"(?<![a-zA-Z])tis(?![a-zA-Z])", "it is", text)
    text = re.sub(r"(?<![a-zA-Z])thru(?![a-zA-Z])", "through", text)
    text = re.sub(r"(?<![a-zA-Z])ov(?![a-zA-Z])", "of", text)
    text = re.sub(r"(?<![a-zA-Z])hast(?![a-zA-Z])", "have", text)

    # double whitespaces
    text = re.sub(r" +", " ", text)
  
    return text

In [7]:
text = [clean_text(token) for token in tokens]

In [8]:
text[:10]

['in', 'the', 'devoid', 'of', 'sorrow', '\n', 'the', 'air', 'and', 'my']

##EXPLORATORY DATA ANALYSIS

In [9]:
# total tokens
len(text)

390866

In [10]:
# unique tokens
len(set(text))

21491

In [None]:
# hapax legomena - outliers - possibly remove?

In [None]:
# vocab analysis - lemmatize

##SEQUENCES

In [15]:
seq_len = 50
X_items = []
y = []

for i in range(0, len(tokens)-seq_len):
	X_items.append(tokens[i:i+seq_len])
	y.append(tokens[i+seq_len])

In [16]:
X = [" ".join(item) for item in X_items]

In [17]:
X[:10]

['In the Devoid of Sorrow \n The Air and my Heart full of Luster \n Behind the Mountains \n There must be a key to the Gates \n Right passed easy Talk \n We can once again walk the Path of Sorrow \n Let the Turn of Search bring you',
 'the Devoid of Sorrow \n The Air and my Heart full of Luster \n Behind the Mountains \n There must be a key to the Gates \n Right passed easy Talk \n We can once again walk the Path of Sorrow \n Let the Turn of Search bring you away',
 'Devoid of Sorrow \n The Air and my Heart full of Luster \n Behind the Mountains \n There must be a key to the Gates \n Right passed easy Talk \n We can once again walk the Path of Sorrow \n Let the Turn of Search bring you away \n',
 'of Sorrow \n The Air and my Heart full of Luster \n Behind the Mountains \n There must be a key to the Gates \n Right passed easy Talk \n We can once again walk the Path of Sorrow \n Let the Turn of Search bring you away \n Two',
 'Sorrow \n The Air and my Heart full of Luster \n Behind the M

In [18]:
y[:10]

['away', '\n', 'Two', 'Shades', 'of', 'the', 'Midnight', 'Fire', '\n', 'In']

In [None]:
# TESTING THE DIMENSIONS

In [20]:
x_test = [re.findall(r"\S+|\n", sequence) for sequence in X]

In [21]:
lengths_x = [len(item) for item in x_test]

In [22]:
lengths_x[:10]

[50, 50, 50, 50, 50, 50, 50, 50, 50, 50]

In [23]:
len(set(lengths_x))

1

In [24]:
y_test = [re.findall(r"\S+|\n", sequence) for sequence in y]

In [25]:
lengths_y = [len(item) for item in y_test]

In [26]:
lengths_y[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [27]:
len(set(lengths_y))

1

##TOKENIZATION

In [28]:
from keras.preprocessing.text import Tokenizer

# to keep the \n
filters_ = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t' 
tokenizer = Tokenizer(filters=filters_)

tokenizer.fit_on_texts(X)

In [30]:
X_emb = tokenizer.texts_to_sequences(X)
y_emb = tokenizer.texts_to_sequences(y)

In [None]:
# TESTING THE DIMENSIONS
# WHYYYYYYY

In [36]:
lengths_X_emb = [len(item) for item in X_emb]

In [38]:
set(lengths_X_emb)

{46, 47, 48, 49, 50, 51, 52, 53, 54, 55}

In [37]:
lengths_y_emb = [len(item) for item in y_emb]

In [39]:
set(lengths_y_emb)

{0, 1, 2, 3, 4}

In [35]:
from tensorflow.keras.utils import to_categorical

vocab_size = len(tokenizer.word_index)

y_categorical = to_categorical(y_emb, num_classes=vocab_size)

ValueError: ignored

##MODEL

In [None]:
from keras.models import Sequential
from keras.layers import Input, Embedding, LSTM, Dense, Dropout

In [None]:
INPUT_SHAPE = seq_len
VOCABULARY_SIZE = len(tokenizer.word_index)
EMBEDDING_DIMENSION = 50
LSTM_UNITS = 50
DROPOUT_RATE = 0.2

In [None]:
rnn = Sequential()

rnn.add(Input(INPUT_SHAPE,))
# +1 here resolves the indexing problem during training
rnn.add(Embedding(VOCABULARY_SIZE+1, EMBEDDING_DIMENSION))
rnn.add(LSTM(LSTM_UNITS, return_sequences=True))
rnn.add(Dense(VOCABULARY_SIZE, activation="softmax"))

In [None]:
rnn.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
rnn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 50)            886750    
                                                                 
 lstm_1 (LSTM)               (None, 50, 50)            20200     
                                                                 
 dense (Dense)               (None, 50, 17734)         904434    
                                                                 
Total params: 1,811,384
Trainable params: 1,811,384
Non-trainable params: 0
_________________________________________________________________


##MODEL TRAINING

##GENERATION