## IMPORTS

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re

In [2]:
# Read the whole corpus into memory. The "utf-8-sig" codec strips a leading
# byte-order mark if the file has one. The context manager guarantees the
# file handle is closed, which the bare open(...).read() did not.
with open("corpus.txt", "r", encoding="utf-8-sig") as corpus_file:
    dataset = corpus_file.read()

In [3]:
dataset[:100]

'In the Devoid of Sorrow\nThe Air and my Heart full of Luster\nBehind the Mountains\nThere must be a key'

## PREPROCESSING

In [4]:
def clean_text(text):
    """Normalise raw lyrics text for word-level tokenisation.

    Steps, in order:
      1. lower-case everything;
      2. map a fixed list of archaic / slang spellings to modern English;
      3. replace every character that is not an ASCII letter, an apostrophe
         or a newline with a space;
      4. collapse runs of spaces into a single space.

    Args:
        text: the raw corpus as one string.

    Returns:
        The cleaned string. Newlines and apostrophes are preserved.
    """
    # lower-case
    text = text.lower()

    # clean archaisms and other special cases
    # (the pattern consumes the trailing space, so the replacement must put
    # it back, otherwise "'cause i" becomes "becausei")
    text = re.sub(r"'cause ", "because ", text)
    text = re.sub(r"shalt", "shall", text)
    text = re.sub(r"kaos", "chaos", text)
    text = re.sub(r"'till", "until", text)
    text = re.sub(r"fuckin'", "fucking", text)
    text = re.sub(r"couldst", "could", text)
    text = re.sub(r"sayeth", "says", text)
    text = re.sub(r"calleth", "calls", text)
    text = re.sub(r"&", " and ", text)
    text = re.sub(r"sathan", "satan", text)
    text = re.sub(r"'em", "them", text)

    # possible parts of other words: only replace when not surrounded by letters
    text = re.sub(r"(?<![a-zA-Z])thy(?![a-zA-Z])|thine", "your", text)
    # "thou"/"thee" must be bounded too, otherwise "without", "though",
    # "thought", "thousand" ... get mangled into "wiyout", "youugh", ...
    text = re.sub(r"(?<![a-zA-Z])(?:thou|thee|ye)(?![a-zA-Z])", "you", text)
    text = re.sub(r"(?<![a-zA-Z])o(?![a-zA-Z])", "oh", text)
    text = re.sub(r"(?<![a-zA-Z])'tis(?![a-zA-Z])", "it is", text)
    text = re.sub(r"(?<![a-zA-Z])thru(?![a-zA-Z])", "through", text)
    text = re.sub(r"(?<![a-zA-Z])ov(?![a-zA-Z])", "of", text)
    text = re.sub(r"(?<![a-zA-Z])hast(?![a-zA-Z])", "have", text)

    # punctuation, except \n and '
    text = re.sub(r"[^\na-zA-Z']", " ", text)

    # double whitespaces
    text = re.sub(r" +", " ", text)

    return text

In [5]:
# Normalise the raw corpus (lower-casing, archaism replacement, punctuation removal).
text = clean_text(dataset)

In [6]:
# Word-level tokenisation: every run of non-whitespace characters is a token,
# and each "\n" is kept as a token of its own so line breaks stay in the data.
tokens = re.findall(r"\S+|\n", text)

In [7]:
tokens[600:666]

['we',
 'burnt',
 'them',
 'in',
 'the',
 'purgatory',
 'them',
 'the',
 'children',
 'of',
 'god',
 '\n',
 'barely',
 'forgotten',
 'these',
 'times',
 'are',
 '\n',
 'but',
 'not',
 'for',
 'a',
 'soul',
 'whose',
 'rest',
 "hasn't",
 'been',
 'found',
 '\n',
 'from',
 'beyond',
 'come',
 'the',
 'storms',
 '\n',
 'landscapes',
 'turn',
 'to',
 'ash',
 'before',
 'my',
 'pressured',
 'eyes',
 '\n',
 'nothingness',
 'turns',
 'to',
 'nothingness',
 '\n',
 'and',
 'my',
 'imagination',
 'fades',
 'like',
 'dust',
 'clouds',
 'over',
 '\n',
 'this',
 'deserted',
 'land',
 'feel',
 'to',
 'now',
 'satyricon',
 '\n']

## EXPLORATORY DATA ANALYSIS

In [8]:
# total tokens
len(tokens)

391905

In [9]:
# unique tokens
len(set(tokens))

17559

In [10]:
# TODO: hapax legomena (tokens that occur only once) are outliers - possibly remove them?

In [11]:
# TODO: vocabulary analysis - consider lemmatizing the tokens

## SEQUENCES

In [12]:
# Number of context tokens fed to the model for each prediction.
seq_len = 50

# Slide a window of seq_len tokens over the corpus one token at a time:
# the window is the input, the token right after it is the target.
window_starts = range(len(tokens) - seq_len)
X_items = [tokens[start:start + seq_len] for start in window_starts]
y = [tokens[start + seq_len] for start in window_starts]

In [13]:
# Turn every token window back into one space-separated string.
X = list(map(" ".join, X_items))

In [14]:
X[:5]

['in the devoid of sorrow \n the air and my heart full of luster \n behind the mountains \n there must be a key to the gates \n right passed easy talk \n we can once again walk the path of sorrow \n let the turn of search bring you',
 'the devoid of sorrow \n the air and my heart full of luster \n behind the mountains \n there must be a key to the gates \n right passed easy talk \n we can once again walk the path of sorrow \n let the turn of search bring you away',
 'devoid of sorrow \n the air and my heart full of luster \n behind the mountains \n there must be a key to the gates \n right passed easy talk \n we can once again walk the path of sorrow \n let the turn of search bring you away \n',
 'of sorrow \n the air and my heart full of luster \n behind the mountains \n there must be a key to the gates \n right passed easy talk \n we can once again walk the path of sorrow \n let the turn of search bring you away \n two',
 'sorrow \n the air and my heart full of luster \n behind the m

In [15]:
y[:5]

['away', '\n', 'two', 'shades', 'of']

In [16]:
# TESTING THE DIMENSIONS

In [17]:
# Sanity check: re-tokenise each joined string with the same regex used for
# the corpus, to confirm the join/split round trip keeps the token count.
# NOTE(review): despite its name, x_test is not a held-out test set.
x_test = [re.findall(r"\S+|\n", sequence) for sequence in X]

In [18]:
# Token count of every re-tokenised input sequence.
lengths_x = list(map(len, x_test))

In [19]:
set(lengths_x)

{50}

In [20]:
# Sanity check: every target string should re-tokenise to exactly one token.
# NOTE(review): despite its name, y_test is not a held-out test set.
y_test = [re.findall(r"\S+|\n", sequence) for sequence in y]

In [21]:
# Token count of every re-tokenised target.
lengths_y = list(map(len, y_test))

In [22]:
set(lengths_y)

{1}

## TOKENIZATION

In [23]:
from keras.preprocessing.text import Tokenizer

# to keep the \n
# The filter string lists the characters the Tokenizer strips; "\n" is left
# out on purpose so the newline token survives and gets its own index.
# NOTE(review): presumably this is Keras' default filter set minus "\n" - confirm.
filters_ = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t' 
tokenizer = Tokenizer(filters=filters_)

# Build the token -> integer index vocabulary from the input sequences.
tokenizer.fit_on_texts(X)

In [24]:
# Map every token to its integer index from tokenizer.word_index.
# NOTE(review): despite the "_emb" suffix these are integer id sequences,
# not embedding vectors; the Embedding layer of the model does that lookup.
X_emb = tokenizer.texts_to_sequences(X)
y_emb = tokenizer.texts_to_sequences(y)

In [25]:
# TESTING THE DIMENSIONS

In [26]:
# Length of every encoded input sequence.
lengths_X_emb = list(map(len, X_emb))

In [27]:
set(lengths_X_emb)

{50}

In [28]:
# Length of every encoded target (a target whose token is missing from the
# vocabulary would show up here as length 0).
lengths_y_emb = list(map(len, y_emb))

In [29]:
set(lengths_y_emb)

{1}

In [None]:
from tensorflow.keras.utils import to_categorical
# One class per vocabulary entry, plus one extra slot.
# NOTE(review): presumably the +1 is there because Tokenizer indices start at 1
# and index 0 is reserved - confirm against the Keras Tokenizer docs.
classes = len(tokenizer.word_index)+1

# One-hot encode the target token ids.
# NOTE(review): this materialises a dense (num_samples, classes) matrix. With
# the ~392k tokens and ~17.5k unique tokens counted earlier that is on the
# order of tens of GB - confirm it fits in memory, or keep integer targets
# and train with sparse_categorical_crossentropy instead.
y_categorical = to_categorical(y_emb, num_classes=classes)

In [None]:
y_categorical.shape

## MODEL

In [None]:
from keras.models import Sequential
from keras.layers import Input, Embedding, LSTM, Dense, Dropout

In [None]:
# Model hyper-parameters.
INPUT_SHAPE = seq_len  # number of token ids per input sequence
VOCABULARY_SIZE = len(tokenizer.word_index)  # number of distinct tokens seen by the tokenizer
EMBEDDING_DIMENSION = 50  # size of each learned token vector
LSTM_UNITS = 50  # dimensionality of the LSTM output
DROPOUT_RATE = 0.2  # NOTE(review): defined here but not used by the model below

In [None]:
# Next-token language model: token ids -> embeddings -> LSTM -> softmax over
# the vocabulary.
rnn = Sequential()

# Each sample is a sequence of INPUT_SHAPE token ids; `shape` must be a tuple
# (the original `Input(INPUT_SHAPE,)` passed a bare int).
rnn.add(Input(shape=(INPUT_SHAPE,)))
# +1 here resolves the indexing problem during training: the embedding table
# needs one row per possible id 0..VOCABULARY_SIZE, so the largest token id
# (VOCABULARY_SIZE) stays in range.
rnn.add(Embedding(VOCABULARY_SIZE + 1, EMBEDDING_DIMENSION))
# There is exactly one target token per input sequence, so only the last
# LSTM output is needed. With return_sequences=True the model would emit one
# prediction per time step and no longer match the target shape.
rnn.add(LSTM(LSTM_UNITS, return_sequences=False))
# The one-hot targets have len(tokenizer.word_index) + 1 columns, so the
# output layer needs the same number of units.
rnn.add(Dense(VOCABULARY_SIZE + 1, activation="softmax"))

In [None]:
# categorical_crossentropy expects one-hot encoded targets, which is what
# to_categorical produced for y_categorical.
rnn.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
rnn.summary()

## MODEL TRAINING

In [None]:
# fixes some errors
X_train = np.array(X_emb, dtype=float)
y_train = np.array(y_categorical, dtype=float)

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 32  # samples per gradient update
EPOCHS = 5  # full passes over the training data

In [None]:
# Train the model on all sequences (no validation split is held out here).
rnn.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

## GENERATION